From ad6c08d8c1045843ec564a73981ece6ec31d11a0 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 27 Jan 2025 23:52:14 +0000 Subject: cgroup/misc: Remove unused misc_cg_res_total_usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit misc_cg_res_total_usage() was added in 2021 by commit a72232eabdfc ("cgroup: Add misc cgroup controller") but has remained unused. Remove it. Signed-off-by: Dr. David Alan Gilbert Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/misc_cgroup.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index 49eef10c8e59..4bf261d41a6d 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -60,7 +60,6 @@ struct misc_cg { struct misc_res res[MISC_CG_RES_TYPES]; }; -u64 misc_cg_res_total_usage(enum misc_res_type type); int misc_cg_set_capacity(enum misc_res_type type, u64 capacity); int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount); void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount); @@ -104,11 +103,6 @@ static inline void put_misc_cg(struct misc_cg *cg) #else /* !CONFIG_CGROUP_MISC */ -static inline u64 misc_cg_res_total_usage(enum misc_res_type type) -{ - return 0; -} - static inline int misc_cg_set_capacity(enum misc_res_type type, u64 capacity) { return 0; -- cgit v1.2.3 From b9a49520679e98700d3d89689cc91c08a1c88c1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 Jan 2025 00:55:32 +0100 Subject: rcuref: Plug slowpath race in rcuref_put() Kernel test robot reported an "imbalanced put" in the rcuref_put() slow path, which turned out to be a false positive. Consider the following race:

    ref = 0 (via rcuref_init(ref, 1))

    T1                                          T2
    rcuref_put(ref)
    -> atomic_add_negative_release(-1, ref)     # ref -> 0xffffffff
    -> rcuref_put_slowpath(ref)
                                                rcuref_get(ref)
                                                -> atomic_add_negative_relaxed(1, &ref->refcnt)
                                                -> return true;                          # ref -> 0

                                                rcuref_put(ref)
                                                -> atomic_add_negative_release(-1, ref)  # ref -> 0xffffffff
                                                -> rcuref_put_slowpath()
                                                -> cnt = atomic_read(&ref->refcnt);      # cnt -> 0xffffffff / RCUREF_NOREF
                                                -> atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD))  # ref -> 0xe0000000 / RCUREF_DEAD
                                                -> return true

    -> cnt = atomic_read(&ref->refcnt);         # cnt -> 0xe0000000 / RCUREF_DEAD
    -> if (cnt > RCUREF_RELEASED)               # 0xe0000000 > 0xc0000000
    -> WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")

The problem is the additional read in the slow path (after it decremented to RCUREF_NOREF) which can happen after the counter has been marked RCUREF_DEAD. Prevent this by reusing the return value of the decrement. Now every "final" put uses RCUREF_NOREF in the slow path and attempts the final cmpxchg() to RCUREF_DEAD.
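The shape of the fix can be modeled in plain C11 atomics. This is an illustrative sketch only, not the kernel implementation: the RCUREF_* constants mirror rcuref's zones, and put()/put_slowpath() are simplified stand-ins for __rcuref_put()/rcuref_put_slowpath().

    #include <stdatomic.h>
    #include <stdbool.h>

    #define RCUREF_NOREF 0xffffffffu    /* count dropped below zero */
    #define RCUREF_DEAD  0xe0000000u    /* object has been released */

    static bool put_slowpath(_Atomic unsigned int *ref, unsigned int cnt)
    {
            /* Only the put that decremented to NOREF attempts the final
             * transition to DEAD; anything else is in the saturation or
             * dead zone and must not touch the counter again. */
            if (cnt == RCUREF_NOREF)
                    return atomic_compare_exchange_strong(ref, &cnt, RCUREF_DEAD);
            return false;
    }

    static bool put(_Atomic unsigned int *ref)
    {
            /* Keep the value produced by the decrement itself. A fresh
             * atomic_read() here could already observe RCUREF_DEAD written
             * by a concurrent final put - exactly the false "imbalanced
             * put" in the trace above. */
            unsigned int cnt = atomic_fetch_sub_explicit(ref, 1,
                                                         memory_order_release) - 1;
            if ((int)cnt >= 0)
                    return false;   /* fast path: references remain */
            return put_slowpath(ref, cnt);
    }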
[ bigeasy: Add changelog ] Fixes: ee1ee6db07795 ("atomics: Provide rcuref - scalable reference counting") Reported-by: kernel test robot Debugged-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Reviewed-by: Sebastian Andrzej Siewior Cc: stable@vger.kernel.org Closes: https://lore.kernel.org/oe-lkp/202412311453.9d7636a2-lkp@intel.com --- include/linux/rcuref.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h index 2c8bfd0f1b6b..6322d8c1c6b4 100644 --- a/include/linux/rcuref.h +++ b/include/linux/rcuref.h @@ -71,27 +71,30 @@ static inline __must_check bool rcuref_get(rcuref_t *ref) return rcuref_get_slowpath(ref); } -extern __must_check bool rcuref_put_slowpath(rcuref_t *ref); +extern __must_check bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt); /* * Internal helper. Do not invoke directly. */ static __always_inline __must_check bool __rcuref_put(rcuref_t *ref) { + int cnt; + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(), "suspicious rcuref_put_rcusafe() usage"); /* * Unconditionally decrease the reference count. The saturation and * dead zones provide enough tolerance for this. */ - if (likely(!atomic_add_negative_release(-1, &ref->refcnt))) + cnt = atomic_sub_return_release(1, &ref->refcnt); + if (likely(cnt >= 0)) return false; /* * Handle the last reference drop and cases inside the saturation * and dead zones. */ - return rcuref_put_slowpath(ref); + return rcuref_put_slowpath(ref, cnt); } /** -- cgit v1.2.3 From f7f6142107f0e0dd4a2b041116461a049ca18cb0 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Fri, 31 Jan 2025 16:09:29 +0900 Subject: sched_ext: Add an event, SCX_EV_SELECT_CPU_FALLBACK Add a core event, SCX_EV_SELECT_CPU_FALLBACK, which represents how many times ops.select_cpu() returns a CPU that the task can't use. __scx_add_event() is used since the caller holds an rq lock, so preemption is already disabled. Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1d70a9867fb1..f7545430a548 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -146,6 +146,7 @@ struct sched_ext_entity { u32 weight; s32 sticky_cpu; s32 holding_cpu; + s32 selected_cpu; u32 kf_mask; /* see scx_kf_mask above */ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ atomic_long_t ops_state; -- cgit v1.2.3 From 41d88484c71cd4f659348da41b7b5b3dbd3be1f6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Sun, 26 Jan 2025 09:47:27 +0200 Subject: x86/mm/pat: restore large ROX pages after fragmentation Changing the attributes of pages may lead to fragmentation of the direct mapping over time and to performance degradation when these pages contain executable code. With the current code it's a one-way road: the kernel tries to avoid splitting large pages, but it doesn't restore them even if the page attributes become compatible again. Any change to the mapping may potentially allow restoring a large page. Add a hook to the cpa_flush() path that will check whether the pages in the range that were just touched can be mapped at the PMD level. If the collapse at the PMD level succeeded, also attempt to collapse the PUD level.
The collapse logic runs only when a set_memory_ method explicitly sets the CPA_COLLAPSE flag; for now this is only enabled in set_memory_rox(). CPUs don't like[1] to have TLB entries of different size for the same memory, but it looks like it's okay as long as these entries have matching attributes[2]. Therefore it's critical to flush the TLB before any following changes to the mapping. Note that we already allow for multiple TLB entries of different sizes for the same memory in the split_large_page() path. It's not a new situation. set_memory_4k() provides a way to use 4k pages on purpose. The kernel must not remap such pages as large. Re-use one of the software PTE bits to indicate such pages. [1] See Erratum 383 of AMD Family 10h Processors [2] https://lore.kernel.org/linux-mm/1da1b025-cabc-6f04-bde5-e50830d1ecf0@amd.com/ [rppt@kernel.org: * s/restore/collapse/ * update formatting per peterz * use 'struct ptdesc' instead of 'struct page' for list of page tables to be freed * try to collapse PMD first and if it succeeds move on to PUD as peterz suggested * flush TLB twice: for changes done in the original CPA call and after collapsing of large pages * update commit message ] Signed-off-by: "Kirill A. Shutemov" Co-developed-by: "Mike Rapoport (Microsoft)" Signed-off-by: "Mike Rapoport (Microsoft)" Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250126074733.1384926-4-rppt@kernel.org --- include/linux/vm_event_item.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index f70d0958095c..5a37cb2b6f93 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -151,6 +151,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, DIRECT_MAP_LEVEL3_SPLIT, + DIRECT_MAP_LEVEL2_COLLAPSE, + DIRECT_MAP_LEVEL3_COLLAPSE, #endif #ifdef CONFIG_PER_VMA_LOCK_STATS VMA_LOCK_SUCCESS, -- cgit v1.2.3 From 05e555b817262b5df6aa3a73df8b3dc9d388a3b4 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 26 Jan 2025 09:47:29 +0200 Subject: execmem: add API for temporal remapping as RW and restoring ROX afterwards Using a writable copy for ROX memory is cumbersome and error prone. Add an API that allows temporarily remapping ranges in the ROX cache as writable and then restoring their read-only-execute permissions. This API will later be used in the module code and will allow removing nasty games with writable copy in alternatives patching on x86. Restoring the ROX permissions relies on the ability of the architecture to reconstruct large pages in its set_memory_rox() method. Signed-off-by: "Mike Rapoport (Microsoft)" Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250126074733.1384926-6-rppt@kernel.org --- include/linux/execmem.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 64130ae19690..65655a5d1be2 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -65,6 +65,37 @@ enum execmem_range_flags { * Architectures that use EXECMEM_ROX_CACHE must implement this.
*/ void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable); + +/** + * execmem_make_temp_rw - temporarily remap region with read-write + * permissions + * @ptr: address of the region to remap + * @size: size of the region to remap + * + * Remaps a part of the cached large page in the ROX cache in the range + * [@ptr, @ptr + @size) as writable and not executable. The caller must + * have exclusive ownership of this range and ensure nothing will try to + * execute code in this range. + * + * Return: 0 on success or negative error code on failure. + */ +int execmem_make_temp_rw(void *ptr, size_t size); + +/** + * execmem_restore_rox - restore read-only-execute permissions + * @ptr: address of the region to remap + * @size: size of the region to remap + * + * Restores read-only-execute permissions on a range [@ptr, @ptr + @size) + * after it was temporarily remapped as writable. Relies on architecture + * implementation of set_memory_rox() to restore mapping using large pages. + * + * Return: 0 on success or negative error code on failure. + */ +int execmem_restore_rox(void *ptr, size_t size); +#else +static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; } +static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; } #endif /** -- cgit v1.2.3 From c287c072332905b7d878a8aade86cfef6b396343 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 26 Jan 2025 09:47:30 +0200 Subject: module: switch to execmem API for remapping as RW and restoring ROX Instead of using writable copy for module text sections, temporarily remap the memory allocated from execmem's ROX cache as writable and restore its ROX permissions after the module is formed. This will allow removing nasty games with writable copy in alternatives patching on x86. Signed-off-by: "Mike Rapoport (Microsoft)" Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250126074733.1384926-7-rppt@kernel.org --- include/linux/module.h | 8 +------- include/linux/moduleloader.h | 4 ---- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 23792d5d7b74..ddf27ede7c91 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -367,7 +367,6 @@ enum mod_mem_type { struct module_memory { void *base; - void *rw_copy; bool is_rox; unsigned int size; @@ -769,14 +768,9 @@ static inline bool is_livepatch_module(struct module *mod) void set_module_sig_enforced(void); -void *__module_writable_address(struct module *mod, void *loc); - static inline void *module_writable_address(struct module *mod, void *loc) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) || !mod || - mod->state != MODULE_STATE_UNFORMED) - return loc; - return __module_writable_address(mod, loc); + return loc; } #else /* !CONFIG_MODULES... 
*/ diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 1f5507ba5a12..e395461d59e5 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -108,10 +108,6 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod); -int module_post_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *mod); - #ifdef CONFIG_MODULES void flush_module_init_free_work(void); #else -- cgit v1.2.3 From 602df3712979de594d268e8cbca46a93126c977d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 26 Jan 2025 09:47:32 +0200 Subject: module: drop unused module_writable_address() module_writable_address() is unused and can be removed. Signed-off-by: "Mike Rapoport (Microsoft)" Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250126074733.1384926-9-rppt@kernel.org --- include/linux/module.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index ddf27ede7c91..a76928c93692 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -768,11 +768,6 @@ static inline bool is_livepatch_module(struct module *mod) void set_module_sig_enforced(void); -static inline void *module_writable_address(struct module *mod, void *loc) -{ - return loc; -} - #else /* !CONFIG_MODULES... */ static inline struct module *__module_address(unsigned long addr) @@ -880,11 +875,6 @@ static inline bool module_is_coming(struct module *mod) { return false; } - -static inline void *module_writable_address(struct module *mod, void *loc) -{ - return loc; -} #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 2d5c7ce5bf4cc27db41632f357f682d0ee4518e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Paku=C5=82a?= Date: Sat, 1 Feb 2025 12:38:48 +0100 Subject: HID: pidff: Add MISSING_DELAY quirk and its detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A lot of devices do not include the delay field, and it's seldom used in force feedback implementations. I tested about three dozen applications and none of them make use of the delay. This fixes initialization of a lot of PID wheels like Cammus, VRS and FFBeast. This change has no effect on fully compliant devices. Co-developed-by: Makarenko Oleg Signed-off-by: Makarenko Oleg Signed-off-by: Tomasz Pakuła Reviewed-by: Michał Kopeć Reviewed-by: Paul Dino Jones Tested-by: Paul Dino Jones Tested-by: Cristóferson Bueno Tested-by: Pablo Cisneros Signed-off-by: Jiri Kosina --- include/linux/hid.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index cdc0dc13c87f..9c3a728786c3 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1228,6 +1228,9 @@ int hid_pidff_init(struct hid_device *hid); #define hid_pidff_init NULL #endif +/* HID PIDFF quirks */ +#define HID_PIDFF_QUIRK_MISSING_DELAY BIT(0) + #define dbg_hid(fmt, ...) pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__) #define hid_err(hid, fmt, ...) \ -- cgit v1.2.3 From fc7c154e9bb3c2b98875cfc565406f4787e3b7a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Paku=C5=82a?= Date: Sat, 1 Feb 2025 12:38:49 +0100 Subject: HID: pidff: Add MISSING_PBO quirk and its detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some devices with only one axis are missing the PARAMETER_BLOCK_OFFSET field for conditional effects.
They can only have one axis, so we're limiting the max_axis when setting the report for those effects. Automatic detection ensures compatibility even if such a device isn't explicitly defined in the kernel. Fixes initialization of VRS DirectForce PRO and possibly other devices. Changes in v6: - Fixed a NULL pointer dereference: when PBO is missing, make sure not to set it anyway. Co-developed-by: Makarenko Oleg Signed-off-by: Makarenko Oleg Signed-off-by: Tomasz Pakuła Reviewed-by: Michał Kopeć Reviewed-by: Paul Dino Jones Tested-by: Paul Dino Jones Tested-by: Cristóferson Bueno Tested-by: Pablo Cisneros Signed-off-by: Jiri Kosina --- include/linux/hid.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 9c3a728786c3..ea7ba8e4bfe4 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1230,6 +1230,7 @@ int hid_pidff_init(struct hid_device *hid); /* HID PIDFF quirks */ #define HID_PIDFF_QUIRK_MISSING_DELAY BIT(0) +#define HID_PIDFF_QUIRK_MISSING_PBO BIT(1) #define dbg_hid(fmt, ...) pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__) -- cgit v1.2.3 From a4119108d2530747e61c7cbf52e2affd089cb1f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Paku=C5=82a?= Date: Sat, 1 Feb 2025 12:38:50 +0100 Subject: HID: pidff: Add PERMISSIVE_CONTROL quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this quirk, a PID device isn't required to have a strict logical_minimum of 1 for the PID_DEVICE_CONTROL usage page. Some devices come with weird values in their device descriptors and this quirk enables their initialization even if the logical minimum of the DEVICE_CONTROL page is not 1. Fixes initialization of the VRS DirectForce PRO. Changes in v6: - Changed the quirk name to better reflect its intention. Co-developed-by: Makarenko Oleg Signed-off-by: Makarenko Oleg Signed-off-by: Tomasz Pakuła Reviewed-by: Michał Kopeć Reviewed-by: Paul Dino Jones Tested-by: Paul Dino Jones Tested-by: Cristóferson Bueno Tested-by: Pablo Cisneros Signed-off-by: Jiri Kosina --- include/linux/hid.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index ea7ba8e4bfe4..89a4dee37729 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1229,8 +1229,9 @@ int hid_pidff_init(struct hid_device *hid); #endif /* HID PIDFF quirks */ -#define HID_PIDFF_QUIRK_MISSING_DELAY BIT(0) -#define HID_PIDFF_QUIRK_MISSING_PBO BIT(1) +#define HID_PIDFF_QUIRK_MISSING_DELAY BIT(0) +#define HID_PIDFF_QUIRK_MISSING_PBO BIT(1) +#define HID_PIDFF_QUIRK_PERMISSIVE_CONTROL BIT(2) #define dbg_hid(fmt, ...) pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__) -- cgit v1.2.3 From 36de0164bbaff1484288e84ac5df5cff00580263 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Paku=C5=82a?= Date: Sat, 1 Feb 2025 12:38:51 +0100 Subject: HID: pidff: Add hid_pidff_init_with_quirks and export as GPL symbol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This lays out a way to provide an initial set of quirks to enable before device initialization takes place. The GPL symbol export is needed so that HID drivers which use this function can be built as modules. A wrapper function is added to ensure compatibility with the old behavior of hid_pidff_init.
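To illustrate the intended call pattern, a device-specific driver's probe function could hand its known quirks to the PID layer up front. This is a sketch under stated assumptions: the probe wiring and the example_wheel name are hypothetical, while hid_pidff_init_with_quirks() and the quirk bits are the ones added by this series.

    static int example_wheel_probe(struct hid_device *hdev,
                                   const struct hid_device_id *id)
    {
            int ret;

            ret = hid_parse(hdev);
            if (ret)
                    return ret;

            /* Claim the device, but leave force feedback to the PID driver. */
            ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF);
            if (ret)
                    return ret;

            /* Descriptor is known to lack the delay field and to use a
             * non-standard logical minimum on PID_DEVICE_CONTROL. */
            return hid_pidff_init_with_quirks(hdev,
                                              HID_PIDFF_QUIRK_MISSING_DELAY |
                                              HID_PIDFF_QUIRK_PERMISSIVE_CONTROL);
    }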
Signed-off-by: Tomasz Pakuła Reviewed-by: Michał Kopeć Reviewed-by: Paul Dino Jones Tested-by: Paul Dino Jones Tested-by: Cristóferson Bueno Tested-by: Pablo Cisneros Signed-off-by: Jiri Kosina --- include/linux/hid.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 89a4dee37729..31dfe9ed5394 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1224,8 +1224,10 @@ void hid_quirks_exit(__u16 bus); #ifdef CONFIG_HID_PID int hid_pidff_init(struct hid_device *hid); +int hid_pidff_init_with_quirks(struct hid_device *hid, __u32 initial_quirks); #else #define hid_pidff_init NULL +#define hid_pidff_init_with_quirks NULL #endif /* HID PIDFF quirks */ -- cgit v1.2.3 From 3051bf5ec773b803c474ea556b57d678a8885be3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Paku=C5=82a?= Date: Sat, 1 Feb 2025 12:38:52 +0100 Subject: HID: pidff: Add FIX_WHEEL_DIRECTION quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most steering wheels simply ignore the DIRECTION field, but some try to be compliant with the PID standard and use it in force calculations. Games often ignore setting this field properly and/or there can be issues with dinput8 -> wine -> SDL -> Linux API translation, and this value can be incorrect. This can lead to partial/complete loss of Force Feedback or even unexpected force reversal. Sadly, this quirk can't be detected automatically without sending out effects that would move an axis. This fixes FFB on Moza Racing devices and others where effect direction is not simply ignored. Signed-off-by: Tomasz Pakuła Reviewed-by: Michał Kopeć Reviewed-by: Paul Dino Jones Signed-off-by: Jiri Kosina --- include/linux/hid.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 31dfe9ed5394..7a55accf689e 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1234,6 +1234,7 @@ int hid_pidff_init_with_quirks(struct hid_device *hid, __u32 initial_quirks); #define HID_PIDFF_QUIRK_MISSING_DELAY BIT(0) #define HID_PIDFF_QUIRK_MISSING_PBO BIT(1) #define HID_PIDFF_QUIRK_PERMISSIVE_CONTROL BIT(2) +#define HID_PIDFF_QUIRK_FIX_WHEEL_DIRECTION BIT(3) #define dbg_hid(fmt, ...) pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__) -- cgit v1.2.3 From abdbf8764f4962af2a910abb3a213ecf304a73d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Paku=C5=82a?= Date: Sat, 1 Feb 2025 12:38:56 +0100 Subject: HID: pidff: Add PERIODIC_SINE_ONLY quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some devices only support the SINE periodic effect although they advertise support for all PERIODIC effects in their HID descriptor. Some just do nothing when trying to play such an effect (the upload goes fine), some exhibit undefined behavior like turning to one side. This quirk forces all the periodic effects to be uploaded as SINE. This is acceptable as all these effects are similar in nature and are mostly used as rumble. SINE is the most popular, with the others seldom used (especially SAW_UP and SAW_DOWN).
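In effect, the quirk boils down to a waveform override at effect-upload time, roughly like the sketch below. The helper and its surroundings are a hypothetical paraphrase; only HID_PIDFF_QUIRK_PERIODIC_SINE_ONLY itself comes from this patch.

    /* Illustrative only: degrade every periodic waveform to SINE. */
    static u32 pidff_fixup_waveform(u32 quirks, u32 waveform)
    {
            if (quirks & HID_PIDFF_QUIRK_PERIODIC_SINE_ONLY)
                    return FF_SINE; /* SQUARE/TRIANGLE/SAW_UP/SAW_DOWN -> SINE */
            return waveform;
    }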
Fixes periodic effects for PXN and LITE STAR wheels. Signed-off-by: Tomasz Pakuła Reviewed-by: Michał Kopeć Reviewed-by: Paul Dino Jones Tested-by: Cristóferson Bueno Signed-off-by: Jiri Kosina --- include/linux/hid.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 7a55accf689e..e180679ab284 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1235,6 +1235,7 @@ int hid_pidff_init_with_quirks(struct hid_device *hid, __u32 initial_quirks); #define HID_PIDFF_QUIRK_MISSING_PBO BIT(1) #define HID_PIDFF_QUIRK_PERMISSIVE_CONTROL BIT(2) #define HID_PIDFF_QUIRK_FIX_WHEEL_DIRECTION BIT(3) +#define HID_PIDFF_QUIRK_PERIODIC_SINE_ONLY BIT(4) #define dbg_hid(fmt, ...) pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__) -- cgit v1.2.3 From 0d24d4b1da96df9fc5ff36966f40f980ef864d46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Paku=C5=82a?= Date: Sat, 1 Feb 2025 12:39:03 +0100 Subject: HID: pidff: Move all hid-pidff definitions to a dedicated header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not clutter hid includes with stuff not needed outside of the kernel. Signed-off-by: Tomasz Pakuła Reviewed-by: Michał Kopeć Reviewed-by: Paul Dino Jones Tested-by: Paul Dino Jones Tested-by: Cristóferson Bueno Tested-by: Pablo Cisneros Signed-off-by: Jiri Kosina --- include/linux/hid.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index e180679ab284..9ca7e26ac4e9 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1222,21 +1222,6 @@ unsigned long hid_lookup_quirk(const struct hid_device *hdev); int hid_quirks_init(char **quirks_param, __u16 bus, int count); void hid_quirks_exit(__u16 bus); -#ifdef CONFIG_HID_PID -int hid_pidff_init(struct hid_device *hid); -int hid_pidff_init_with_quirks(struct hid_device *hid, __u32 initial_quirks); -#else -#define hid_pidff_init NULL -#define hid_pidff_init_with_quirks NULL -#endif - -/* HID PIDFF quirks */ -#define HID_PIDFF_QUIRK_MISSING_DELAY BIT(0) -#define HID_PIDFF_QUIRK_MISSING_PBO BIT(1) -#define HID_PIDFF_QUIRK_PERMISSIVE_CONTROL BIT(2) -#define HID_PIDFF_QUIRK_FIX_WHEEL_DIRECTION BIT(3) -#define HID_PIDFF_QUIRK_PERIODIC_SINE_ONLY BIT(4) - #define dbg_hid(fmt, ...) pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__) #define hid_err(hid, fmt, ...) \ -- cgit v1.2.3 From 51333bfbf18f730a78bbb85895f9c9e4c994d883 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 18 Jan 2025 22:10:17 +0100 Subject: usb: musb: Constify struct musb_fifo_cfg 'struct musb_fifo_cfg' are not modified in these drivers. Constifying these structures moves some data to a read-only section, which increases overall security.
On x86_64, with allmodconfig, as an example:

Before:
======
   text    data    bss     dec     hex   filename
  64381    5537    202   70120   111e8   drivers/usb/musb/musb_core.o

After:
=====
   text    data    bss     dec     hex   filename
  64957    4929    202   70088   111c8   drivers/usb/musb/musb_core.o

Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/26e6f3e25dc8e785d0034dd7e59918e455563e60.1737234596.git.christophe.jaillet@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/musb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/musb.h b/include/linux/usb/musb.h index 3963e55e88a3..fbdef950f06c 100644 --- a/include/linux/usb/musb.h +++ b/include/linux/usb/musb.h @@ -61,7 +61,7 @@ struct musb_hdrc_eps_bits { }; struct musb_hdrc_config { - struct musb_fifo_cfg *fifo_cfg; /* board fifo configuration */ + const struct musb_fifo_cfg *fifo_cfg; /* board fifo configuration */ unsigned fifo_cfg_size; /* size of the fifo configuration */ /* MUSB configuration-specific details */ -- cgit v1.2.3 From 470cb490d1b75cf25f3139dcf0226967bcc6e217 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 13 Jan 2025 15:43:18 -0600 Subject: iio: adc: ad7173: move fwnode_irq_get_byname() call site Move the call to fwnode_irq_get_byname() from the driver-specific ad7173_fw_parse_device_config() to the shared ad_sd_init() function. The main reason for this is that we want struct ad_sigma_delta_info to be static const data that describes the actual ADC chip, not the application-specific configuration or any runtime state. Previously, this struct was being used to pass the IRQ number to the shared ad_sd_init() function. Now, this is replaced by a boolean flag that is set at compile time and the ad_sd_init() function handles looking up the IRQ number instead. This also has the added benefit that if any other drivers need to make use of this in the future, they just have to set the flag and the shared code will take care of the rest rather than duplicating the code in each driver. Signed-off-by: David Lechner Link: https://patch.msgid.link/20250113-iio-adc-ad7313-fix-non-const-info-struct-v4-1-b63be3ecac4a@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/ad_sigma_delta.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index 417073c52380..f242b285081b 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -46,6 +46,7 @@ struct iio_dev; * modify or drop the sample data, it, may be NULL. * @has_registers: true if the device has writable and readable registers, false * if there is just one read-only sample data shift register. + * @has_named_irqs: Set to true if there is more than one IRQ line. * @addr_shift: Shift of the register address in the communications register. * @read_mask: Mask for the communications register having the read bit set. * @status_ch_mask: Mask for the channel number stored in status register. * @data_reg: Address of the data register, if 0 the default address will * be used. * @irq_flags: flags for the interrupt used by the triggered buffer * @num_slots: Number of sequencer slots - * @irq_line: IRQ for reading conversions. If 0, spi->irq will be used * @num_resetclks: Number of SPI clk cycles with MOSI=1 to reset the chip.
*/ struct ad_sigma_delta_info { @@ -64,13 +64,13 @@ struct ad_sigma_delta_info { int (*disable_one)(struct ad_sigma_delta *, unsigned int chan); int (*postprocess_sample)(struct ad_sigma_delta *, unsigned int raw_sample); bool has_registers; + bool has_named_irqs; unsigned int addr_shift; unsigned int read_mask; unsigned int status_ch_mask; unsigned int data_reg; unsigned long irq_flags; unsigned int num_slots; - int irq_line; unsigned int num_resetclks; }; -- cgit v1.2.3 From b944249bcea97f2f6229852ae3f05f7acdcb0681 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 29 Jan 2025 17:57:59 +0100 Subject: fsnotify: add mount notification infrastructure This is just the plumbing between the event source (fs/namespace.c) and the event consumer (fanotify). In itself it does nothing. Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20250129165803.72138-2-mszeredi@redhat.com Signed-off-by: Christian Brauner --- include/linux/fsnotify.h | 20 +++++++++++++++++++ include/linux/fsnotify_backend.h | 42 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 1a9ef8f6784d..589e274adc7d 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -299,6 +299,11 @@ static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt) __fsnotify_vfsmount_delete(mnt); } +static inline void fsnotify_mntns_delete(struct mnt_namespace *mntns) +{ + __fsnotify_mntns_delete(mntns); +} + /* * fsnotify_inoderemove - an inode is going away */ @@ -507,4 +512,19 @@ static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode, NULL, NULL, NULL, 0); } +static inline void fsnotify_mnt_attach(struct mnt_namespace *ns, struct vfsmount *mnt) +{ + fsnotify_mnt(FS_MNT_ATTACH, ns, mnt); +} + +static inline void fsnotify_mnt_detach(struct mnt_namespace *ns, struct vfsmount *mnt) +{ + fsnotify_mnt(FS_MNT_DETACH, ns, mnt); +} + +static inline void fsnotify_mnt_move(struct mnt_namespace *ns, struct vfsmount *mnt) +{ + fsnotify_mnt(FS_MNT_MOVE, ns, mnt); +} + #endif /* _LINUX_FS_NOTIFY_H */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 0d24a21a8e60..6cd8d1d28b8b 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -59,6 +59,10 @@ #define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */ +#define FS_MNT_ATTACH 0x01000000 /* Mount was attached */ +#define FS_MNT_DETACH 0x02000000 /* Mount was detached */ +#define FS_MNT_MOVE (FS_MNT_ATTACH | FS_MNT_DETACH) + /* * Set on inode mark that cares about things that happen to its children. * Always set for dnotify and inotify. 
@@ -80,6 +84,9 @@ */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) +/* Mount namespace events */ +#define FSNOTIFY_MNT_EVENTS (FS_MNT_ATTACH | FS_MNT_DETACH) + /* Content events can be used to inspect file content */ #define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \ FS_ACCESS_PERM) @@ -108,6 +115,7 @@ /* Events that can be reported to backends */ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \ + FSNOTIFY_MNT_EVENTS | \ FS_EVENTS_POSS_ON_CHILD | \ FS_DELETE_SELF | FS_MOVE_SELF | \ FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \ @@ -298,6 +306,7 @@ enum fsnotify_data_type { FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, + FSNOTIFY_EVENT_MNT, FSNOTIFY_EVENT_ERROR, }; @@ -318,6 +327,11 @@ static inline const struct path *file_range_path(const struct file_range *range) return range->path; } +struct fsnotify_mnt { + const struct mnt_namespace *ns; + u64 mnt_id; +}; + static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { @@ -383,6 +397,24 @@ static inline struct super_block *fsnotify_data_sb(const void *data, } } +static inline const struct fsnotify_mnt *fsnotify_data_mnt(const void *data, + int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_MNT: + return data; + default: + return NULL; + } +} + +static inline u64 fsnotify_data_mnt_id(const void *data, int data_type) +{ + const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); + + return mnt_data ? mnt_data->mnt_id : 0; +} + static inline struct fs_error_report *fsnotify_data_error_report( const void *data, int data_type) @@ -420,6 +452,7 @@ enum fsnotify_iter_type { FSNOTIFY_ITER_TYPE_SB, FSNOTIFY_ITER_TYPE_PARENT, FSNOTIFY_ITER_TYPE_INODE2, + FSNOTIFY_ITER_TYPE_MNTNS, FSNOTIFY_ITER_TYPE_COUNT }; @@ -429,6 +462,7 @@ enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_INODE, FSNOTIFY_OBJ_TYPE_VFSMOUNT, FSNOTIFY_OBJ_TYPE_SB, + FSNOTIFY_OBJ_TYPE_MNTNS, FSNOTIFY_OBJ_TYPE_COUNT, FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT }; @@ -613,8 +647,10 @@ extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data extern void __fsnotify_inode_delete(struct inode *inode); extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt); extern void fsnotify_sb_delete(struct super_block *sb); +extern void __fsnotify_mntns_delete(struct mnt_namespace *mntns); extern void fsnotify_sb_free(struct super_block *sb); extern u32 fsnotify_get_cookie(void); +extern void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt); static inline __u32 fsnotify_parent_needed_mask(__u32 mask) { @@ -928,6 +964,9 @@ static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt) static inline void fsnotify_sb_delete(struct super_block *sb) {} +static inline void __fsnotify_mntns_delete(struct mnt_namespace *mntns) +{} + static inline void fsnotify_sb_free(struct super_block *sb) {} @@ -942,6 +981,9 @@ static inline u32 fsnotify_get_cookie(void) static inline void fsnotify_unmount_inodes(struct super_block *sb) {} +static inline void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) +{} + #endif /* CONFIG_FSNOTIFY */ #endif /* __KERNEL __ */ -- cgit v1.2.3 From 42e128c9d5c5b6632c3b182afb7fab0957b6c337 Mon Sep 17 00:00:00 2001 From: "Dr. 
David Alan Gilbert" Date: Wed, 22 Jan 2025 01:25:59 +0000 Subject: tty/ldsem: Remove unused ldsem_down_write_trylock ldsem_down_write_trylock() was added in 2013 by commit 4898e640caf0 ("tty: Add timed, writer-prioritized rw semaphore") but hasn't been used. Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Jiri Slaby Link: https://lore.kernel.org/r/20250122012559.441006-1-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_ldisc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index af01e89074b2..c5cccc3fc1e8 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -39,7 +39,6 @@ do { \ int ldsem_down_read(struct ld_semaphore *sem, long timeout); int ldsem_down_read_trylock(struct ld_semaphore *sem); int ldsem_down_write(struct ld_semaphore *sem, long timeout); -int ldsem_down_write_trylock(struct ld_semaphore *sem); void ldsem_up_read(struct ld_semaphore *sem); void ldsem_up_write(struct ld_semaphore *sem); -- cgit v1.2.3 From 2cf424f5ac01682c93e3decfddee6282b7552f50 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 3 Feb 2025 18:52:29 +0000 Subject: mlx4: Remove unused functions The last use of mlx4_find_cached_mac() was removed in 2014 by commit 2f5bb473681b ("mlx4: Add ref counting to port MAC table for RoCE") mlx4_zone_free_entries() was added in 2014 by commit 7a89399ffad7 ("net/mlx4: Add mlx4_bitmap zone allocator") but hasn't been used. (The _unique version is used) Remove them. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Simon Horman Reviewed-by: Tariq Toukan Reviewed-by: Kalesh AP Link: https://patch.msgid.link/20250203185229.204279-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- include/linux/mlx4/device.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 27f42f713c89..87edb7a8173b 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1415,7 +1415,6 @@ int mlx4_get_is_vlan_offload_disabled(struct mlx4_dev *dev, u8 port, bool *vlan_offload_disabled); void mlx4_handle_eth_header_mcast_prio(struct mlx4_net_trans_rule_hw_ctrl *ctrl, struct _rule_hw *eth_header); -int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx); int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx); int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan); -- cgit v1.2.3 From 73298c7cf1b901809d2a7f3636e4635839ff8033 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 Jan 2025 08:52:15 -0800 Subject: rcu: Remove references to old grace-period-wait primitives The rcu_barrier_sched(), synchronize_sched(), and synchronize_rcu_bh() RCU API members have been gone for many years. This commit therefore removes non-historical instances of them. Reported-by: Joe Perches Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/rcupdate.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd..3bb554723074 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -806,11 +806,9 @@ do { \ * sections, invocation of the corresponding RCU callback is deferred * until after the all the other CPUs exit their critical sections. 
* - * In v5.0 and later kernels, synchronize_rcu() and call_rcu() also - * wait for regions of code with preemption disabled, including regions of - * code with interrupts or softirqs disabled. In pre-v5.0 kernels, which - * define synchronize_sched(), only code enclosed within rcu_read_lock() - * and rcu_read_unlock() are guaranteed to be waited for. + * Both synchronize_rcu() and call_rcu() also wait for regions of code + * with preemption disabled, including regions of code with interrupts or + * softirqs disabled. * * Note, however, that RCU callbacks are permitted to run concurrently * with new RCU read-side critical sections. One way that this can happen @@ -865,11 +863,10 @@ static __always_inline void rcu_read_lock(void) * rcu_read_unlock() - marks the end of an RCU read-side critical section. * * In almost all situations, rcu_read_unlock() is immune from deadlock. - * In recent kernels that have consolidated synchronize_sched() and - * synchronize_rcu_bh() into synchronize_rcu(), this deadlock immunity - * also extends to the scheduler's runqueue and priority-inheritance - * spinlocks, courtesy of the quiescent-state deferral that is carried - * out when rcu_read_unlock() is invoked with interrupts disabled. + * This deadlock immunity also extends to the scheduler's runqueue + * and priority-inheritance spinlocks, courtesy of the quiescent-state + * deferral that is carried out when rcu_read_unlock() is invoked with + * interrupts disabled. * * See rcu_read_lock() for more information. */ -- cgit v1.2.3 From 83179cd67846fae0e6aea5848c07faaa5d89a1de Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Fri, 24 Jan 2025 09:38:26 +0000 Subject: uprobes: Remove the spinlock within handle_singlestep() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces a flag to track whether TIF_SIGPENDING was temporarily suppressed during the uprobe single-step. Once the uprobe single-step is handled and the flag is confirmed, TIF_SIGPENDING can be restored directly without acquiring the siglock in most cases, reducing contention and improving overall performance. I've used the script developed by Andrii in [1] to run the benchmark. The CPU used was Kunpeng916 (Hi1616), 4 NUMA nodes, 64 cores@2.4GHz running the kernel on next tree + the optimization for get_xol_insn_slot() [2].
before-opt
----------
uprobe-nop  ( 1 cpus): 0.907 ± 0.003M/s ( 0.907M/s/cpu)
uprobe-nop  ( 2 cpus): 1.676 ± 0.008M/s ( 0.838M/s/cpu)
uprobe-nop  ( 4 cpus): 3.210 ± 0.003M/s ( 0.802M/s/cpu)
uprobe-nop  ( 8 cpus): 4.457 ± 0.003M/s ( 0.557M/s/cpu)
uprobe-nop  (16 cpus): 3.724 ± 0.011M/s ( 0.233M/s/cpu)
uprobe-nop  (32 cpus): 2.761 ± 0.003M/s ( 0.086M/s/cpu)
uprobe-nop  (64 cpus): 1.293 ± 0.015M/s ( 0.020M/s/cpu)
uprobe-push ( 1 cpus): 0.883 ± 0.001M/s ( 0.883M/s/cpu)
uprobe-push ( 2 cpus): 1.642 ± 0.005M/s ( 0.821M/s/cpu)
uprobe-push ( 4 cpus): 3.086 ± 0.002M/s ( 0.771M/s/cpu)
uprobe-push ( 8 cpus): 3.390 ± 0.003M/s ( 0.424M/s/cpu)
uprobe-push (16 cpus): 2.652 ± 0.005M/s ( 0.166M/s/cpu)
uprobe-push (32 cpus): 2.713 ± 0.005M/s ( 0.085M/s/cpu)
uprobe-push (64 cpus): 1.313 ± 0.009M/s ( 0.021M/s/cpu)
uprobe-ret  ( 1 cpus): 1.774 ± 0.000M/s ( 1.774M/s/cpu)
uprobe-ret  ( 2 cpus): 3.350 ± 0.001M/s ( 1.675M/s/cpu)
uprobe-ret  ( 4 cpus): 6.604 ± 0.000M/s ( 1.651M/s/cpu)
uprobe-ret  ( 8 cpus): 6.706 ± 0.005M/s ( 0.838M/s/cpu)
uprobe-ret  (16 cpus): 5.231 ± 0.001M/s ( 0.327M/s/cpu)
uprobe-ret  (32 cpus): 5.743 ± 0.003M/s ( 0.179M/s/cpu)
uprobe-ret  (64 cpus): 4.726 ± 0.016M/s ( 0.074M/s/cpu)

after-opt
---------
uprobe-nop  ( 1 cpus): 0.985 ± 0.002M/s ( 0.985M/s/cpu)
uprobe-nop  ( 2 cpus): 1.773 ± 0.005M/s ( 0.887M/s/cpu)
uprobe-nop  ( 4 cpus): 3.304 ± 0.001M/s ( 0.826M/s/cpu)
uprobe-nop  ( 8 cpus): 5.328 ± 0.002M/s ( 0.666M/s/cpu)
uprobe-nop  (16 cpus): 6.475 ± 0.002M/s ( 0.405M/s/cpu)
uprobe-nop  (32 cpus): 4.831 ± 0.082M/s ( 0.151M/s/cpu)
uprobe-nop  (64 cpus): 2.564 ± 0.053M/s ( 0.040M/s/cpu)
uprobe-push ( 1 cpus): 0.964 ± 0.001M/s ( 0.964M/s/cpu)
uprobe-push ( 2 cpus): 1.766 ± 0.002M/s ( 0.883M/s/cpu)
uprobe-push ( 4 cpus): 3.290 ± 0.009M/s ( 0.823M/s/cpu)
uprobe-push ( 8 cpus): 4.670 ± 0.002M/s ( 0.584M/s/cpu)
uprobe-push (16 cpus): 5.197 ± 0.004M/s ( 0.325M/s/cpu)
uprobe-push (32 cpus): 5.068 ± 0.161M/s ( 0.158M/s/cpu)
uprobe-push (64 cpus): 2.605 ± 0.026M/s ( 0.041M/s/cpu)
uprobe-ret  ( 1 cpus): 1.833 ± 0.001M/s ( 1.833M/s/cpu)
uprobe-ret  ( 2 cpus): 3.384 ± 0.003M/s ( 1.692M/s/cpu)
uprobe-ret  ( 4 cpus): 6.677 ± 0.004M/s ( 1.669M/s/cpu)
uprobe-ret  ( 8 cpus): 6.854 ± 0.005M/s ( 0.857M/s/cpu)
uprobe-ret  (16 cpus): 6.508 ± 0.006M/s ( 0.407M/s/cpu)
uprobe-ret  (32 cpus): 5.793 ± 0.009M/s ( 0.181M/s/cpu)
uprobe-ret  (64 cpus): 4.743 ± 0.016M/s ( 0.074M/s/cpu)

The above benchmark results demonstrate an obvious improvement in the scalability of trig-uprobe-nop and trig-uprobe-push, whose peak throughputs rise from 4.5M/s to 6.4M/s and from 3.3M/s to 5.1M/s respectively.
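The locking change itself reduces to the pattern below — a paraphrase with hypothetical helper names; only the utask->signal_denied flag is what this patch actually adds to struct uprobe_task.

    static void uprobe_deny_signal(struct uprobe_task *utask)
    {
            /* Before single-stepping: park a pending signal, no siglock. */
            if (signal_pending(current)) {
                    clear_thread_flag(TIF_SIGPENDING);
                    utask->signal_denied = true;
            }
    }

    static void uprobe_restore_signal(struct uprobe_task *utask)
    {
            /* After the single-step: we only undo our own suppression,
             * so TIF_SIGPENDING can be restored without the siglock. */
            if (utask->signal_denied) {
                    set_thread_flag(TIF_SIGPENDING);
                    utask->signal_denied = false;
            }
    }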
[1] https://lore.kernel.org/all/20240731214256.3588718-1-andrii@kernel.org [2] https://lore.kernel.org/all/20240727094405.1362496-1-liaochang1@huawei.com Acked-by: Masami Hiramatsu (Google) Acked-by: Oleg Nesterov Signed-off-by: Liao Chang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250124093826.2123675-3-liaochang1@huawei.com --- include/linux/uprobes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b1df7d792fa1..a40efdda9052 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -143,6 +143,7 @@ struct uprobe_task { struct uprobe *active_uprobe; unsigned long xol_vaddr; + bool signal_denied; struct arch_uprobe *auprobe; }; -- cgit v1.2.3 From 8ce939a0fa194939cc1f92dbd8bc1a7806e7d40a Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 21 Jan 2025 07:23:02 -0800 Subject: perf: Avoid the read if the count is already updated The event may have been updated in the PMU-specific implementation, e.g., Intel PEBS counters snapshotting. The common code should not read and overwrite the value. The PERF_SAMPLE_READ in the data->sample_type can be used to detect whether the PMU-specific value is available. If yes, avoid the pmu->read() in the common code. Add a new flag, skip_read, to track the case. Factor out a perf_pmu_read() to clean up the code. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250121152303.3128733-3-kan.liang@linux.intel.com --- include/linux/perf_event.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8333f132f4a9..2d07bc1193f3 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1062,7 +1062,13 @@ struct perf_output_handle { struct perf_buffer *rb; unsigned long wakeup; unsigned long size; - u64 aux_flags; + union { + u64 flags; /* perf_output*() */ + u64 aux_flags; /* perf_aux_output*() */ + struct { + u64 skip_read : 1; + }; + }; union { void *addr; unsigned long head; -- cgit v1.2.3 From b14ff274e8aa5517ff86c94d682bf26bf8b5dcc8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 3 Feb 2025 10:28:47 +0100 Subject: slab, rcu: move TINY_RCU variant of kvfree_rcu() to SLAB Following the move of TREE_RCU implementation, let's move also the TINY_RCU one for consistency and subsequent refactoring. For simplicity, remove the separate inline __kvfree_call_rcu() as TINY_RCU is not meant for high-performance hardware anyway. Declare kvfree_call_rcu() in rcupdate.h to avoid header dependency issues. Also move the kvfree_rcu_barrier() declaration to slab.h Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Joel Fernandes (Google) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Paul E. 
McKenney Signed-off-by: Vlastimil Babka --- include/linux/rcupdate.h | 5 +++++ include/linux/rcutiny.h | 36 ------------------------------------ include/linux/rcutree.h | 3 --- include/linux/slab.h | 14 ++++++++++++++ 4 files changed, 19 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd..3f70d1c81444 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1082,6 +1082,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) #define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) +/* + * In mm/slab_common.c, no suitable header to include here. + */ +void kvfree_call_rcu(struct rcu_head *head, void *ptr); + #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index fe42315f667f..f519cd680228 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -90,41 +90,6 @@ static inline void synchronize_rcu_expedited(void) synchronize_rcu(); } -/* - * Add one more declaration of kvfree() here. It is - * not so straight forward to just include - * where it is defined due to getting many compile - * errors caused by that include. - */ -extern void kvfree(const void *addr); - -static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - if (head) { - call_rcu(head, (rcu_callback_t) ((void *) head - ptr)); - return; - } - - // kvfree_rcu(one_arg) call. - might_sleep(); - synchronize_rcu(); - kvfree(ptr); -} - -static inline void kvfree_rcu_barrier(void) -{ - rcu_barrier(); -} - -#ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, void *ptr); -#else -static inline void kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - __kvfree_call_rcu(head, ptr); -} -#endif - void rcu_qs(void); static inline void rcu_softirq_qs(void) @@ -164,7 +129,6 @@ static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_eqs(void) { } -static inline void kfree_rcu_scheduler_running(void) { } /* Avoid RCU read-side critical sections leaking across. 
*/ static inline void rcu_all_qs(void) { barrier(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 27d86d912781..dbe77b5fe06e 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -34,12 +34,9 @@ static inline void rcu_virt_note_context_switch(void) } void synchronize_rcu_expedited(void); -void kvfree_call_rcu(struct rcu_head *head, void *ptr); -void kvfree_rcu_barrier(void); void rcu_barrier(void); void rcu_momentary_eqs(void); -void kfree_rcu_scheduler_running(void); struct rcu_gp_oldstate { unsigned long rgos_norm; diff --git a/include/linux/slab.h b/include/linux/slab.h index 09eedaecf120..bcc62e5656c3 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1082,6 +1083,19 @@ extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); +#ifdef CONFIG_TINY_RCU +static inline void kvfree_rcu_barrier(void) +{ + rcu_barrier(); +} + +static inline void kfree_rcu_scheduler_running(void) { } +#else +void kvfree_rcu_barrier(void); + +void kfree_rcu_scheduler_running(void); +#endif + /** * kmalloc_size_roundup - Report allocation bucket size for the given size * -- cgit v1.2.3 From 49d5377b38aa127451cf5dc6d6ea5d9da7f465a4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 3 Feb 2025 10:28:49 +0100 Subject: rcu, slab: use a regular callback function for kvfree_rcu RCU has been special-casing callback function pointers that are integers lower than 4096 as offsets of rcu_head for kvfree() instead. The tree RCU implementation no longer does that as the batched kvfree_rcu() is not a simple call_rcu(). The tiny RCU still does, and the plan is also to make tree RCU use call_rcu() for SLUB_TINY configurations. Instead of teaching tree RCU again to special case the offsets, let's remove the special casing completely. Since there's no SLOB anymore, it is possible to create a callback function that can take a pointer to a middle of slab object with unknown offset and determine the object's pointer before freeing it, so implement that as kvfree_rcu_cb(). Large kmalloc and vmalloc allocations are handled simply by aligning down to page size. For that we retain the requirement that the offset is smaller than 4096. But we can remove __is_kvfree_rcu_offset() completely and instead just opencode the condition in the BUILD_BUG_ON() check. Reviewed-by: Joel Fernandes (Google) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Paul E. McKenney Signed-off-by: Vlastimil Babka --- include/linux/rcupdate.h | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 3f70d1c81444..23bcf71ffb06 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1025,12 +1025,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define RCU_POINTER_INITIALIZER(p, v) \ .p = RCU_INITIALIZER(v) -/* - * Does the specified offset indicate that the corresponding rcu_head - * structure can be handled by kvfree_rcu()? - */ -#define __is_kvfree_rcu_offset(offset) ((offset) < 4096) - /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree for double-argument invocations. 
@@ -1041,11 +1035,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * when they are used in a kernel module, that module must invoke the * high-latency rcu_barrier() function at module-unload time. * - * The kfree_rcu() function handles this issue. Rather than encoding a - * function address in the embedded rcu_head structure, kfree_rcu() instead - * encodes the offset of the rcu_head structure within the base structure. - * Because the functions are not allowed in the low-order 4096 bytes of - * kernel virtual memory, offsets up to 4095 bytes can be accommodated. + * The kfree_rcu() function handles this issue. In order to have a universal + * callback function handling different offsets of rcu_head, the callback needs + * to determine the starting address of the freed object, which can be a large + * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to + * page boundary for those, only offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to @@ -1087,14 +1081,18 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) */ void kvfree_call_rcu(struct rcu_head *head, void *ptr); +/* + * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the + * comment of kfree_rcu() for details. + */ #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ \ - if (___p) { \ - BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf))); \ - kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ - } \ + if (___p) { \ + BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096); \ + kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ + } \ } while (0) #define kvfree_rcu_arg_1(ptr) \ -- cgit v1.2.3 From c9f8f1242a4c3e48adc6c3cf6b31c1ffbaa49943 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 3 Feb 2025 10:28:50 +0100 Subject: slab: don't batch kvfree_rcu() with SLUB_TINY kvfree_rcu() is batched for better performance except on TINY_RCU, which is a simple implementation for small UP systems. Similarly SLUB_TINY is an option intended for small systems, whether or not used together with TINY_RCU. In case SLUB_TINY is used with !TINY_RCU, it arguably makes sense to not do the batching and to limit the memory footprint. It's also suboptimal to have RCU-specific #ifdefs in slab code. With that, add CONFIG_KVFREE_RCU_BATCHED to determine whether the batching kvfree_rcu() implementation is used. It is not set by a user prompt, but enabled by default and disabled in case TINY_RCU or SLUB_TINY are enabled. Use the new config for #ifdef's in slab code and extend their scope to cover all code used by the batched kvfree_rcu(). For example there's no need to perform kvfree_rcu_init() if the batching is disabled. Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Joel Fernandes (Google) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Paul E.
McKenney Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index bcc62e5656c3..7686054dd494 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1083,7 +1083,7 @@ extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); -#ifdef CONFIG_TINY_RCU +#ifndef CONFIG_KVFREE_RCU_BATCHED static inline void kvfree_rcu_barrier(void) { rcu_barrier(); -- cgit v1.2.3 From ad6b5b73ff565e88aca7a7d1286788d80c97ba71 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 12 Dec 2024 20:06:52 -0800 Subject: rcu: fix header guard for rcu_all_qs() rcu_all_qs() is defined for !CONFIG_PREEMPT_RCU but the declaration is conditioned on CONFIG_PREEMPTION. With CONFIG_PREEMPT_LAZY, CONFIG_PREEMPTION=y does not imply CONFIG_PREEMPT_RCU=y. Decouple the two. Cc: Paul E. McKenney Reviewed-by: Frederic Weisbecker Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Ankur Arora Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/rcutree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 27d86d912781..aad586f15ed0 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -103,7 +103,7 @@ extern int rcu_scheduler_active; void rcu_end_inkernel_boot(void); bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); -#ifndef CONFIG_PREEMPTION +#ifndef CONFIG_PREEMPT_RCU void rcu_all_qs(void); #endif -- cgit v1.2.3 From 4dca1af414fb1f27c3350a65820cb0b91178e8fe Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 12 Dec 2024 20:06:53 -0800 Subject: rcu: rename PREEMPT_AUTO to PREEMPT_LAZY Replace mentions of PREEMPT_AUTO with PREEMPT_LAZY. Also, since PREEMPT_LAZY implies PREEMPTION, we can reduce the TASKS_RCU selection criteria from this: NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO) to this: NEED_TASKS_RCU && PREEMPTION CC: Paul E. McKenney Reviewed-by: Frederic Weisbecker Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Ankur Arora Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/srcutiny.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 1321da803274..31b59b4be2a7 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -64,7 +64,7 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp) { int idx; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], READ_ONCE(ssp->srcu_lock_nesting[idx]) + 1); preempt_enable(); -- cgit v1.2.3 From fcf0e25ad4c8d14d2faab4d9a17040f31efce205 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 12 Dec 2024 20:06:55 -0800 Subject: rcu: handle unstable rdp in rcu_read_unlock_strict() rcu_read_unlock_strict() can be called with preemption enabled which can make for an unstable rdp and a racy norm value. Fix this by dropping the preempt-count in __rcu_read_unlock() after the call to rcu_read_unlock_strict(), adjusting the preempt-count check appropriately. Suggested-by: Frederic Weisbecker Signed-off-by: Ankur Arora Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E.
McKenney Signed-off-by: Boqun Feng --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd..257e9ae34414 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -95,9 +95,9 @@ static inline void __rcu_read_lock(void) static inline void __rcu_read_unlock(void) { - preempt_enable(); if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) rcu_read_unlock_strict(); + preempt_enable(); } static inline int rcu_preempt_depth(void) -- cgit v1.2.3 From b459874faa7b6096982552498cab735075e53013 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Dec 2024 12:39:58 -0800 Subject: srcu: Define SRCU_READ_FLAVOR_ALL in terms of symbols This commit defines SRCU_READ_FLAVOR_ALL in terms of the SRCU_READ_FLAVOR_* definitions instead of a hexadecimal constant. Suggested-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index d7ba46e74f58..f6f779b9d9ff 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -47,7 +47,8 @@ int init_srcu_struct(struct srcu_struct *ssp); #define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). #define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). #define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). -#define SRCU_READ_FLAVOR_ALL 0x7 // All of the above. +#define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ + SRCU_READ_FLAVOR_LITE) // All of the above. #ifdef CONFIG_TINY_SRCU #include -- cgit v1.2.3 From 56eb8be144c2bdb3a96a0d4365777fc64c65c5d4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Dec 2024 11:13:51 -0800 Subject: srcu: Pull ->srcu_{un,}lock_count into a new srcu_ctr structure This commit prepares for array-index-free srcu_read_lock*() by moving the ->srcu_{un,}lock_count fields into a new srcu_ctr structure. This will permit ->srcu_index to be replaced by a per-CPU pointer to this structure. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index b17814c9d1c7..c794d599db5c 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -17,14 +17,19 @@ struct srcu_node; struct srcu_struct; +/* One element of the srcu_data srcu_ctrs array. */ +struct srcu_ctr { + atomic_long_t srcu_locks; /* Locks per CPU. */ + atomic_long_t srcu_unlocks; /* Unlocks per CPU. */ +}; + /* * Per-CPU structure feeding into leaf srcu_node, similar in function * to rcu_node. */ struct srcu_data { /* Read-side state. */ - atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */ - atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. */ + struct srcu_ctr srcu_ctrs[2]; /* Locks and unlocks per CPU. */ int srcu_reader_flavor; /* Reader flavor for srcu_struct structure? 
*/ /* Values: SRCU_READ_FLAVOR_.* */ @@ -221,7 +226,7 @@ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */ + this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_locks.counter); /* Y */ barrier(); /* Avoid leaking the critical section. */ return idx; } @@ -240,7 +245,7 @@ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) { barrier(); /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); /* Z */ + this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_unlocks.counter); /* Z */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite()."); } -- cgit v1.2.3 From 795e7efec6ea7e9d597c3fced9f5307fae467cb0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Dec 2024 16:08:54 -0800 Subject: srcu: Make SRCU readers use ->srcu_ctrs for counter selection This commit causes SRCU readers to use ->srcu_ctrs for counter selection instead of ->srcu_idx. This takes another step towards array-indexing-free SRCU readers. [ paulmck: Apply kernel test robot feedback. ] Co-developed-by: Z qiang Signed-off-by: Z qiang Signed-off-by: Paul E. McKenney Tested-by: kernel test robot Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index c794d599db5c..1b01ced61a45 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -101,6 +101,7 @@ struct srcu_usage { */ struct srcu_struct { unsigned int srcu_idx; /* Current rdr array element. */ + struct srcu_ctr __percpu *srcu_ctrp; struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ struct lockdep_map dep_map; struct srcu_usage *srcu_sup; /* Update-side data. */ @@ -167,6 +168,7 @@ struct srcu_struct { #define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name) \ { \ .sda = &pcpu_name, \ + .srcu_ctrp = &pcpu_name.srcu_ctrs[0], \ __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ } @@ -222,13 +224,12 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); */ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) { - int idx; + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_locks.counter); /* Y */ + this_cpu_inc(scp->srcu_locks.counter); /* Y */ barrier(); /* Avoid leaking the critical section. */ - return idx; + return scp - &ssp->sda->srcu_ctrs[0]; } /* -- cgit v1.2.3 From 821ca6fa15d864951da89233da8fd89e932d5215 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Dec 2024 16:32:12 -0800 Subject: srcu: Make Tree SRCU updates independent of ->srcu_idx This commit makes Tree SRCU updates independent of ->srcu_idx, then drop ->srcu_idx. Signed-off-by: Paul E. 
McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 1b01ced61a45..6b7eba59f384 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -100,7 +100,6 @@ struct srcu_usage { * Per-SRCU-domain structure, similar in function to rcu_state. */ struct srcu_struct { - unsigned int srcu_idx; /* Current rdr array element. */ struct srcu_ctr __percpu *srcu_ctrp; struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ struct lockdep_map dep_map; -- cgit v1.2.3 From 780818a68132d45d074353fc66f5f116074c3c14 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 7 Jan 2025 17:07:34 -0800 Subject: srcu: Rename srcu_check_read_flavor_lite() to srcu_check_read_flavor_force() This commit renames the srcu_check_read_flavor_lite() function to srcu_check_read_flavor_force() and adds a read_flavor argument in order to support an srcu_read_lock_fast() variant that is to avoid array indexing in both the lock and unlock primitives. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcu.h | 2 +- include/linux/srcutiny.h | 2 +- include/linux/srcutree.h | 10 ++++++---- 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index f6f779b9d9ff..ca00b9af7c23 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -279,7 +279,7 @@ static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_read_flavor_lite(ssp); + srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_LITE); retval = __srcu_read_lock_lite(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 1321da803274..b347bde1aac2 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -82,7 +82,7 @@ static inline void srcu_barrier(struct srcu_struct *ssp) } #define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) -#define srcu_check_read_flavor_lite(ssp) do { } while (0) +#define srcu_check_read_flavor_force(ssp, read_flavor) do { } while (0) /* Defined here to avoid size increase for non-torture kernels. */ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 6b7eba59f384..e29cc57eac81 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -251,16 +251,18 @@ static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); -// Record _lite() usage even for CONFIG_PROVE_RCU=n kernels. -static inline void srcu_check_read_flavor_lite(struct srcu_struct *ssp) +// Record reader usage even for CONFIG_PROVE_RCU=n kernels. This is +// needed only for flavors that require grace-period smp_mb() calls to be +// promoted to synchronize_rcu(). +static inline void srcu_check_read_flavor_force(struct srcu_struct *ssp, int read_flavor) { struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); - if (likely(READ_ONCE(sdp->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE)) + if (likely(READ_ONCE(sdp->srcu_reader_flavor) & read_flavor)) return; // Note that the cmpxchg() in __srcu_check_read_flavor() is fully ordered. 
- __srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); + __srcu_check_read_flavor(ssp, read_flavor); } // Record non-_lite() usage only for CONFIG_PROVE_RCU=y kernels. -- cgit v1.2.3 From 4d86b1e7e1e98eb1f0e3c5a4635a5c37cbd22919 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2025 06:48:15 -0800 Subject: srcu: Add SRCU_READ_FLAVOR_SLOWGP to flag need for synchronize_rcu() This commit switches from a direct test of SRCU_READ_FLAVOR_LITE to a new SRCU_READ_FLAVOR_SLOWGP macro to check for substituting synchronize_rcu() for smp_mb() in SRCU grace periods. Right now, SRCU_READ_FLAVOR_SLOWGP is exactly SRCU_READ_FLAVOR_LITE, but the addition of the _fast() flavor of SRCU will change that. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcu.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ca00b9af7c23..505f5bdce444 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -49,6 +49,9 @@ int init_srcu_struct(struct srcu_struct *ssp); #define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). #define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ SRCU_READ_FLAVOR_LITE) // All of the above. +#define SRCU_READ_FLAVOR_SLOWGP SRCU_READ_FLAVOR_LITE + // Flavors requiring synchronize_rcu() + // instead of smp_mb(). #ifdef CONFIG_TINY_SRCU #include -- cgit v1.2.3 From f4bde41dd19db5e2ea9e0b4a19ac2573f7244d03 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jan 2025 14:31:27 -0800 Subject: srcu: Pull pointer-to-integer conversion into __srcu_ptr_to_ctr() This commit abstracts the srcu_read_lock*() pointer-to-integer conversion into a new __srcu_ptr_to_ctr(). This will be used in rcutorture for testing an srcu_read_lock_fast() that returns a pointer rather than an integer. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index e29cc57eac81..f41bb3a55a04 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -211,6 +211,13 @@ void synchronize_srcu_expedited(struct srcu_struct *ssp); void srcu_barrier(struct srcu_struct *ssp); void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); +// Converts a per-CPU pointer to an ->srcu_ctrs[] array element to that +// element's index. +static inline bool __srcu_ptr_to_ctr(struct srcu_struct *ssp, struct srcu_ctr __percpu *scpp) +{ + return scpp - &ssp->sda->srcu_ctrs[0]; +} + /* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. Returns an index that must be passed to the matching @@ -228,7 +235,7 @@ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); this_cpu_inc(scp->srcu_locks.counter); /* Y */ barrier(); /* Avoid leaking the critical section. */ - return scp - &ssp->sda->srcu_ctrs[0]; + return __srcu_ptr_to_ctr(ssp, scp); } /* -- cgit v1.2.3 From 4937096b579a36cfa5764a229d1a89542e10cf5b Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 8 Jan 2025 15:27:24 -0800 Subject: srcu: Pull integer-to-pointer conversion into __srcu_ctr_to_ptr() This commit abstracts the srcu_read_unlock*() integer-to-pointer conversion into a new __srcu_ctr_to_ptr(). This will be used in rcutorture for testing an srcu_read_unlock_fast() that avoids array-indexing overhead by taking a pointer rather than an integer. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index f41bb3a55a04..55fa400624bb 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -218,6 +218,13 @@ static inline bool __srcu_ptr_to_ctr(struct srcu_struct *ssp, struct srcu_ctr __ return scpp - &ssp->sda->srcu_ctrs[0]; } +// Converts an integer to a per-CPU pointer to the corresponding +// ->srcu_ctrs[] array element. +static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ssp, int idx) +{ + return &ssp->sda->srcu_ctrs[idx]; +} + /* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. Returns an index that must be passed to the matching @@ -252,7 +259,7 @@ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) { barrier(); /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_ctrs[idx].srcu_unlocks.counter); /* Z */ + this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter); /* Z */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite()."); } -- cgit v1.2.3 From 443971156cebfc54a5f5c37a5702c13a2bb06755 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Jan 2025 09:39:35 -0800 Subject: srcu: Move SRCU Tree/Tiny definitions from srcu.h There are a couple of definitions under "#ifdef CONFIG_TINY_SRCU" in include/linux/srcu.h. There is no point in them being there, so this commit moves them to include/linux/srcutiny.h and include/linux/srcutree.c, thus eliminating that #ifdef. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcu.h | 10 +--------- include/linux/srcutiny.h | 3 +++ include/linux/srcutree.h | 1 + 3 files changed, 5 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 505f5bdce444..2bd0e24e9b55 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -52,6 +52,7 @@ int init_srcu_struct(struct srcu_struct *ssp); #define SRCU_READ_FLAVOR_SLOWGP SRCU_READ_FLAVOR_LITE // Flavors requiring synchronize_rcu() // instead of smp_mb(). 
+void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); #ifdef CONFIG_TINY_SRCU #include @@ -64,15 +65,6 @@ int init_srcu_struct(struct srcu_struct *ssp); void call_srcu(struct srcu_struct *ssp, struct rcu_head *head, void (*func)(struct rcu_head *head)); void cleanup_srcu_struct(struct srcu_struct *ssp); -int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); -void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); -#ifdef CONFIG_TINY_SRCU -#define __srcu_read_lock_lite __srcu_read_lock -#define __srcu_read_unlock_lite __srcu_read_unlock -#else // #ifdef CONFIG_TINY_SRCU -int __srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp); -void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) __releases(ssp); -#endif // #else // #ifdef CONFIG_TINY_SRCU void synchronize_srcu(struct srcu_struct *ssp); #define SRCU_GET_STATE_COMPLETED 0x1 diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index b347bde1aac2..a6194f7a7e34 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -71,6 +71,9 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp) return idx; } +#define __srcu_read_lock_lite __srcu_read_lock +#define __srcu_read_unlock_lite __srcu_read_unlock + static inline void synchronize_srcu_expedited(struct srcu_struct *ssp) { synchronize_srcu(ssp); diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 55fa400624bb..ef3065c0cadc 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -207,6 +207,7 @@ struct srcu_struct { #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) +int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void synchronize_srcu_expedited(struct srcu_struct *ssp); void srcu_barrier(struct srcu_struct *ssp); void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); -- cgit v1.2.3 From c4020620528e4e22a051900654a70dcff0ab218d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 Jan 2025 13:19:42 -0800 Subject: srcu: Add SRCU-fast readers This commit adds srcu_read_{,un}lock_fast(), which is similar to srcu_read_{,un}lock_lite(), but avoids the array-indexing and pointer-following overhead. On a microbenchmark featuring tight loops around empty readers, this results in about a 20% speedup compared to RCU Tasks Trace on my x86 laptop. Please note that SRCU-fast has drawbacks compared to RCU Tasks Trace, including: o Lack of CPU stall warnings. o SRCU-fast readers permitted only where rcu_is_watching(). o A pointer-sized return value from srcu_read_lock_fast() must be passed to the corresponding srcu_read_unlock_fast(). o In the absence of readers, a synchronize_srcu() having _fast() readers will incur the latency of at least two normal RCU grace periods. o RCU Tasks Trace priority boosting could be easily added. Boosting SRCU readers is more difficult. SRCU-fast also has a drawback compared to SRCU-lite, namely that the return value from srcu_read_lock_fast() is a 64-bit pointer and that from srcu_read_lock_lite() is only a 32-bit int. [ paulmck: Apply feedback from Akira Yokosawa. ] Signed-off-by: Paul E.
McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcu.h | 47 +++++++++++++++++++++++++++++++++++++++++++++-- include/linux/srcutiny.h | 22 ++++++++++++++++++++++ include/linux/srcutree.h | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 2bd0e24e9b55..63bddc301423 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -47,9 +47,10 @@ int init_srcu_struct(struct srcu_struct *ssp); #define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). #define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). #define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). +#define SRCU_READ_FLAVOR_FAST 0x8 // srcu_read_lock_fast(). #define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ - SRCU_READ_FLAVOR_LITE) // All of the above. -#define SRCU_READ_FLAVOR_SLOWGP SRCU_READ_FLAVOR_LITE + SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST) // All of the above. +#define SRCU_READ_FLAVOR_SLOWGP (SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST) // Flavors requiring synchronize_rcu() // instead of smp_mb(). void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); @@ -253,6 +254,33 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) return retval; } +/** + * srcu_read_lock_fast - register a new reader for an SRCU-protected structure. + * @ssp: srcu_struct in which to register the new reader. + * + * Enter an SRCU read-side critical section, but for a light-weight + * smp_mb()-free reader. See srcu_read_lock() for more information. + * + * If srcu_read_lock_fast() is ever used on an srcu_struct structure, + * then none of the other flavors may be used, whether before, during, + * or after. Note that grace-period auto-expediting is disabled for _fast + * srcu_struct structures because auto-expedited grace periods invoke + * synchronize_rcu_expedited(), IPIs and all. + * + * Note that srcu_read_lock_fast() can be invoked only from those contexts + * where RCU is watching, that is, from contexts where it would be legal + * to invoke rcu_read_lock(). Otherwise, lockdep will complain. + */ +static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct *ssp) __acquires(ssp) +{ + struct srcu_ctr __percpu *retval; + + srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + retval = __srcu_read_lock_fast(ssp); + rcu_try_lock_acquire(&ssp->dep_map); + return retval; +} + /** * srcu_read_lock_lite - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. @@ -356,6 +384,21 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __srcu_read_unlock(ssp, idx); } +/** + * srcu_read_unlock_fast - unregister a old reader from an SRCU-protected structure. + * @ssp: srcu_struct in which to unregister the old reader. + * @scp: return value from corresponding srcu_read_lock_fast(). + * + * Exit a light-weight SRCU read-side critical section. + */ +static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) + __releases(ssp) +{ + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); + srcu_lock_release(&ssp->dep_map); + __srcu_read_unlock_fast(ssp, scp); +} + /** * srcu_read_unlock_lite - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. 
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index a6194f7a7e34..e271f9f96bfc 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -71,6 +71,28 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp) return idx; } +struct srcu_ctr; + +static inline bool __srcu_ptr_to_ctr(struct srcu_struct *ssp, struct srcu_ctr __percpu *scpp) +{ + return (int)(intptr_t)(struct srcu_ctr __force __kernel *)scpp; +} + +static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ssp, int idx) +{ + return (struct srcu_ctr __percpu *)(intptr_t)idx; +} + +static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct *ssp) +{ + return __srcu_ctr_to_ptr(ssp, __srcu_read_lock(ssp)); +} + +static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) +{ + __srcu_read_unlock(ssp, __srcu_ptr_to_ctr(ssp, scp)); +} + #define __srcu_read_lock_lite __srcu_read_lock #define __srcu_read_unlock_lite __srcu_read_unlock diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index ef3065c0cadc..bdc467efce3a 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -226,6 +226,44 @@ static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ss return &ssp->sda->srcu_ctrs[idx]; } +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Returns a pointer that must be passed to the matching + * srcu_read_unlock_fast(). + * + * Note that this_cpu_inc() is an RCU read-side critical section either + * because it disables interrupts, because it is a single instruction, + * or because it is a read-modify-write atomic operation, depending on + * the whims of the architecture. + */ +static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct *ssp) +{ + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); + + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast()."); + this_cpu_inc(scp->srcu_locks.counter); /* Y */ + barrier(); /* Avoid leaking the critical section. */ + return scp; +} + +/* + * Removes the count for the old reader from the appropriate + * per-CPU element of the srcu_struct. Note that this may well be a + * different CPU than that which was incremented by the corresponding + * srcu_read_lock_fast(), but it must be within the same task. + * + * Note that this_cpu_inc() is an RCU read-side critical section either + * because it disables interrupts, because it is a single instruction, + * or because it is a read-modify-write atomic operation, depending on + * the whims of the architecture. + */ +static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) +{ + barrier(); /* Avoid leaking the critical section. */ + this_cpu_inc(scp->srcu_unlocks.counter); /* Z */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast()."); +} + /* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. Returns an index that must be passed to the matching -- cgit v1.2.3 From dfe442c943b7ab86f3eb2228e32179bda1402e96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Jan 2025 21:05:37 -0800 Subject: srcu: Fix srcu_read_unlock_{lite,nmisafe}() kernel-doc The srcu_read_unlock_lite() and srcu_read_unlock_nmisafe() both say that their idx parameters must come from srcu_read_lock(). This would be bad, because a given srcu_struct structure may be used only with one flavor of SRCU reader. 
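As a quick illustration of the one-flavor rule (my_srcu and idx here are made up for this sketch, not taken from any patch):

	DEFINE_SRCU(my_srcu);
	int idx;

	/* Correct: the unlock flavor matches the lock flavor. */
	idx = srcu_read_lock_lite(&my_srcu);
	/* ... light-weight read-side critical section ... */
	srcu_read_unlock_lite(&my_srcu, idx);

	/* Buggy: a normal-flavor lock paired with a _lite() unlock
	 * mixes reader flavors on one srcu_struct and will trip the
	 * reader-flavor checking. */
	idx = srcu_read_lock(&my_srcu);
	srcu_read_unlock_lite(&my_srcu, idx);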
This commit therefore updates the srcu_read_unlock_lite() kernel-doc header to say that its idx parameter must be obtained from srcu_read_lock_lite() and the srcu_read_unlock_nmisafe() kernel-doc header to say that its idx parameter must be obtained from srcu_read_lock_nmisafe(). Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/srcu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 63bddc301423..a0df80baaccf 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -402,7 +402,7 @@ static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ct /** * srcu_read_unlock_lite - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. - * @idx: return value from corresponding srcu_read_lock(). + * @idx: return value from corresponding srcu_read_lock_lite(). * * Exit a light-weight SRCU read-side critical section. */ @@ -418,7 +418,7 @@ static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) /** * srcu_read_unlock_nmisafe - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. - * @idx: return value from corresponding srcu_read_lock(). + * @idx: return value from corresponding srcu_read_lock_nmisafe(). * * Exit an SRCU read-side critical section, but in an NMI-safe manner. */ -- cgit v1.2.3 From 729fb74889d94e6051d6ef2b21c769fe7e54f176 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Jan 2025 09:14:18 -0800 Subject: srcu: Document that srcu_{read_lock,down_read}() can share srcu_struct This commit adds a sentence to the srcu_down_read() function's kernel-doc header noting that it is permissible to use srcu_down_read() and srcu_read_lock() on the same srcu_struct, even concurrently. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a0df80baaccf..317eab82a5f0 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -359,7 +359,8 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) * srcu_down_read() nor srcu_up_read() may be invoked from an NMI handler. * * Calls to srcu_down_read() may be nested, similar to the manner in - * which calls to down_read() may be nested. + * which calls to down_read() may be nested. The same srcu_struct may be + * used concurrently by srcu_down_read() and srcu_read_lock(). */ static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp) { -- cgit v1.2.3 From f8b8df19b2dcb528e97395fdb74ca18b61db5207 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Jan 2025 16:06:58 -0800 Subject: srcu: Add srcu_down_read_fast() and srcu_up_read_fast() A pair of matching srcu_read_lock_fast() and srcu_read_unlock_fast() invocations must take place within the same context, for example, within the same task. Otherwise, lockdep complains, as is the right thing to do for most use cases. However, there are use cases involving tracing (for example, uretprobes) in which an SRCU reader needs to begin in one task and end in a timer handler, which might interrupt some other task. 
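A rough sketch of that use case, using the functions added below (everything other than the srcu_*_fast() calls, including my_tracing_srcu and struct my_instance, is hypothetical):

	/* Task context: the probe fires; begin the reader and stash
	 * the cookie in per-instance state. */
	ri->scp = srcu_down_read_fast(&my_tracing_srcu);
	mod_timer(&ri->timer, jiffies + HZ);

	/* Timer handler, possibly interrupting an unrelated task:
	 * end the reader using the stashed cookie. */
	static void my_timeout_fn(struct timer_list *t)
	{
		struct my_instance *ri = from_timer(ri, t, timer);

		srcu_up_read_fast(&my_tracing_srcu, ri->scp);
	}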
This commit therefore supplies the semaphore-like srcu_down_read_fast() and srcu_up_read_fast() functions, which act like srcu_read_lock_fast() and srcu_read_unlock_fast(), but permitting srcu_up_read_fast() to be invoked in a different context than was the matching srcu_down_read_fast(). Neither srcu_down_read_fast() nor srcu_up_read_fast() may be invoked from an NMI handler. Reported-by: Andrii Nakryiko Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Peter Zijlstra Cc: Kent Overstreet Cc: Signed-off-by: Boqun Feng --- include/linux/srcu.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 317eab82a5f0..900b0d5c05f5 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -281,6 +281,24 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct * return retval; } +/** + * srcu_down_read_fast - register a new reader for an SRCU-protected structure. + * @ssp: srcu_struct in which to register the new reader. + * + * Enter a semaphore-like SRCU read-side critical section, but for + * a light-weight smp_mb()-free reader. See srcu_read_lock_fast() and + * srcu_down_read() for more information. + * + * The same srcu_struct may be used concurrently by srcu_down_read_fast() + * and srcu_read_lock_fast(). + */ +static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *ssp) __acquires(ssp) +{ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); + srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + return __srcu_read_lock_fast(ssp); +} + /** * srcu_read_lock_lite - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. @@ -400,6 +418,22 @@ static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ct __srcu_read_unlock_fast(ssp, scp); } +/** + * srcu_up_read_fast - unregister a old reader from an SRCU-protected structure. + * @ssp: srcu_struct in which to unregister the old reader. + * @scp: return value from corresponding srcu_read_lock_fast(). + * + * Exit an SRCU read-side critical section, but not necessarily from + * the same context as the maching srcu_down_read_fast(). + */ +static inline void srcu_up_read_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) + __releases(ssp) +{ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); + __srcu_read_unlock_fast(ssp, scp); +} + /** * srcu_read_unlock_lite - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. -- cgit v1.2.3 From 3cec27453db49a176e688b7721c3cd26be5ef835 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 Jan 2025 18:32:49 -0800 Subject: srcu: Make SRCU-fast also be NMI-safe BPF uses rcu_read_lock_trace() in NMI context, so srcu_read_lock_fast() must be NMI-safe if it is to have any chance of addressing RCU Tasks Trace use cases. This commit therefore causes srcu_read_lock_fast() and srcu_read_unlock_fast() to use atomic_long_inc() instead of this_cpu_inc() on architectures that support NMIs but do not have NMI-safe implementations of this_cpu_inc(). Note that both x86 and arm64 have NMI-safe implementations of this_cpu_inc(), and thus do not pay the performance penalty inherent in atomic_inc_long(). 
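Concretely, the reader-side increment in __srcu_read_lock_fast() becomes the following pattern (this mirrors the hunk below):

	if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE))
		this_cpu_inc(scp->srcu_locks.counter);
	else
		atomic_long_inc(raw_cpu_ptr(&scp->srcu_locks));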
It is tempting to use this trick to fold srcu_read_lock_nmisafe() into srcu_read_lock(), but this would need careful thought, review, and performance analysis. Though those smp_mb() calls might well make performance a non-issue. Reported-by: Alexei Starovoitov Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index bdc467efce3a..8bed7e6cc4c1 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -231,17 +231,24 @@ static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ss * srcu_struct. Returns a pointer that must be passed to the matching * srcu_read_unlock_fast(). * - * Note that this_cpu_inc() is an RCU read-side critical section either - * because it disables interrupts, because it is a single instruction, - * or because it is a read-modify-write atomic operation, depending on - * the whims of the architecture. + * Note that both this_cpu_inc() and atomic_long_inc() are RCU read-side + * critical sections either because they disables interrupts, because they + * are a single instruction, or because they are a read-modify-write atomic + * operation, depending on the whims of the architecture. + * + * This means that __srcu_read_lock_fast() is not all that fast + * on architectures that support NMIs but do not supply NMI-safe + * implementations of this_cpu_inc(). */ static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct *ssp) { struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast()."); - this_cpu_inc(scp->srcu_locks.counter); /* Y */ + if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) + this_cpu_inc(scp->srcu_locks.counter); /* Y */ + else + atomic_long_inc(raw_cpu_ptr(&scp->srcu_locks)); /* Z */ barrier(); /* Avoid leaking the critical section. */ return scp; } @@ -252,15 +259,22 @@ static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct * different CPU than that which was incremented by the corresponding * srcu_read_lock_fast(), but it must be within the same task. * - * Note that this_cpu_inc() is an RCU read-side critical section either - * because it disables interrupts, because it is a single instruction, - * or because it is a read-modify-write atomic operation, depending on - * the whims of the architecture. + * Note that both this_cpu_inc() and atomic_long_inc() are RCU read-side + * critical sections either because they disables interrupts, because they + * are a single instruction, or because they are a read-modify-write atomic + * operation, depending on the whims of the architecture. + * + * This means that __srcu_read_unlock_fast() is not all that fast + * on architectures that support NMIs but do not supply NMI-safe + * implementations of this_cpu_inc(). */ static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) { barrier(); /* Avoid leaking the critical section. 
*/ - this_cpu_inc(scp->srcu_unlocks.counter); /* Z */ + if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) + this_cpu_inc(scp->srcu_unlocks.counter); /* Z */ + else + atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks)); /* Z */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast()."); } -- cgit v1.2.3 From 623b52802bb0b33bee28716002249aae5d89c5ec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Nov 2024 14:57:51 -0800 Subject: torture: Add get_torture_init_jiffies() for test-start time This commit adds a get_torture_init_jiffies() function that returns the value of the jiffies counter at the start of the test, that is, at the point where torture_init_begin() was invoked. This will be used to enable torture-test holdoffs for tests implemented using per-CPU kthreads, which are created and deleted by CPU-hotplug operations, and thus (unlike normal kthreads) don't automatically know when the test started. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/torture.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 0134e7221cae..1b59056c3b18 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -104,6 +104,7 @@ int torture_stutter_init(int s, int sgap); /* Initialization and cleanup. */ bool torture_init_begin(char *ttype, int v); void torture_init_end(void); +unsigned long get_torture_init_jiffies(void); bool torture_cleanup_begin(void); void torture_cleanup_end(void); bool torture_must_stop(void); -- cgit v1.2.3 From 0f46d81f2bce970b1c562aa3c944a271bbec2729 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 29 Jan 2025 17:58:00 +0100 Subject: fanotify: notify on mount attach and detach Add notifications for attaching and detaching mounts. The following new event masks are added: FAN_MNT_ATTACH - Mount was attached FAN_MNT_DETACH - Mount was detached If a mount is moved, then the event is reported with (FAN_MNT_ATTACH | FAN_MNT_DETACH). These events add an info record of type FAN_EVENT_INFO_TYPE_MNT containing these fields identifying the affected mounts: __u64 mnt_id - the ID of the mount (see statmount(2)) FAN_REPORT_MNT must be supplied to fanotify_init() to receive these events and no other type of event can be received with this report type. Marks are added with FAN_MARK_MNTNS, which records the mount namespace from an nsfs file (e.g. /proc/self/ns/mnt). Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20250129165803.72138-3-mszeredi@redhat.com Signed-off-by: Christian Brauner --- include/linux/fanotify.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 78f660ebc318..3c817dc6292e 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -25,7 +25,7 @@ #define FANOTIFY_FID_BITS (FAN_REPORT_DFID_NAME_TARGET) -#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD) +#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD | FAN_REPORT_MNT) /* * fanotify_init() flags that require CAP_SYS_ADMIN. @@ -38,7 +38,8 @@ FAN_REPORT_PIDFD | \ FAN_REPORT_FD_ERROR | \ FAN_UNLIMITED_QUEUE | \ - FAN_UNLIMITED_MARKS) + FAN_UNLIMITED_MARKS | \ + FAN_REPORT_MNT) /* * fanotify_init() flags that are allowed for user without CAP_SYS_ADMIN. 
@@ -58,7 +59,7 @@ #define FANOTIFY_INTERNAL_GROUP_FLAGS (FANOTIFY_UNPRIV) #define FANOTIFY_MARK_TYPE_BITS (FAN_MARK_INODE | FAN_MARK_MOUNT | \ - FAN_MARK_FILESYSTEM) + FAN_MARK_FILESYSTEM | FAN_MARK_MNTNS) #define FANOTIFY_MARK_CMD_BITS (FAN_MARK_ADD | FAN_MARK_REMOVE | \ FAN_MARK_FLUSH) @@ -109,10 +110,13 @@ /* Events that can only be reported with data type FSNOTIFY_EVENT_ERROR */ #define FANOTIFY_ERROR_EVENTS (FAN_FS_ERROR) +#define FANOTIFY_MOUNT_EVENTS (FAN_MNT_ATTACH | FAN_MNT_DETACH) + /* Events that user can request to be notified on */ #define FANOTIFY_EVENTS (FANOTIFY_PATH_EVENTS | \ FANOTIFY_INODE_EVENTS | \ - FANOTIFY_ERROR_EVENTS) + FANOTIFY_ERROR_EVENTS | \ + FANOTIFY_MOUNT_EVENTS) /* Extra flags that may be reported with event or control handling of events */ #define FANOTIFY_EVENT_FLAGS (FAN_EVENT_ON_CHILD | FAN_ONDIR) -- cgit v1.2.3 From 3e0d3cb3fbe06a7bc09d98324a21a446c80f9d3b Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 3 Dec 2024 07:58:13 +0100 Subject: ice, irdma: move interrupts code to irdma Move responsibility for requesting MSI-X vectors for the RDMA feature from the ice driver to the irdma driver. This allows a simple fallback when there are not enough MSI-X vectors available. Change the number of MSI-X vectors used for control from 4 to 1, as more than one isn't needed for this purpose. Reviewed-by: Jacob Keller Signed-off-by: Michal Swiatkowski Signed-off-by: Tony Nguyen --- include/linux/net/intel/iidc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/net/intel/iidc.h b/include/linux/net/intel/iidc.h index 1c1332e4df26..13274c3def66 100644 --- a/include/linux/net/intel/iidc.h +++ b/include/linux/net/intel/iidc.h @@ -78,6 +78,8 @@ int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type); int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos); +int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); +void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); /* Structure representing auxiliary driver tailored information about the core * PCI dev, each auxiliary driver using the IIDC interface will have an -- cgit v1.2.3 From 79c61899b5eee317907efd1b0d06a1ada0cc00d8 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 4 Feb 2025 18:03:10 +0100 Subject: net-sysfs: remove rtnl_trylock from device attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is an ABBA deadlock between net device unregistration and sysfs files being accessed[1][2]. To prevent this from happening, all paths taking the rtnl lock after the sysfs one (actually the kn->active refcount) use rtnl_trylock and return early (using restart_syscall)[3], which can make syscalls spin for a long time when there is contention on the rtnl lock[4]. There are not many possibilities to improve the above: - Rework the entire net/ locking logic. - Invert two locks in one of the paths — not possible. But here it's actually possible to drop one of the locks safely: the kernfs_node refcount. More details in the code itself, which comes with lots of comments. Note that we check the device is alive in the added sysfs_rtnl_lock helper to disallow sysfs operations from running after device dismantle has started. This also helps keep the same behavior as before.
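The new primitive lets sysfs handlers sleep on the lock instead of spinning on rtnl_trylock(); a minimal sketch of the intended handler shape (the attribute, its body and the some_field member are illustrative, not taken from the patch):

	static ssize_t foo_show(struct device *dev,
				struct device_attribute *attr, char *buf)
	{
		struct net_device *ndev = to_net_dev(dev);
		int ret;

		ret = rtnl_lock_interruptible();
		if (ret)
			return ret;	/* e.g. -EINTR if a signal arrived */

		ret = sysfs_emit(buf, "%d\n", ndev->some_field);
		rtnl_unlock();
		return ret;
	}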
Because of this alive check, calls to dev_isalive() in sysfs ops were removed. [1] https://lore.kernel.org/netdev/49A4D5D5.5090602@trash.net/ [2] https://lore.kernel.org/netdev/m14oyhis31.fsf@fess.ebiederm.org/ [3] https://lore.kernel.org/netdev/20090226084924.16cb3e08@nehalam/ [4] https://lore.kernel.org/all/20210928125500.167943-1-atenart@kernel.org/T/ Signed-off-by: Antoine Tenart Link: https://patch.msgid.link/20250204170314.146022-2-atenart@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 4bc2ee0b10b0..ccaaf4c7d5f6 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -43,6 +43,7 @@ extern void rtnl_lock(void); extern void rtnl_unlock(void); extern int rtnl_trylock(void); extern int rtnl_is_locked(void); +extern int rtnl_lock_interruptible(void); extern int rtnl_lock_killable(void); extern bool refcount_dec_and_rtnl_lock(refcount_t *r); -- cgit v1.2.3 From b7ecc1de51ca7d0a9fa8dbc3f756ab87b99a1838 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 4 Feb 2025 18:03:11 +0100 Subject: net-sysfs: move queue attribute groups outside the default groups Rx/tx queues embed their own kobject for registering their per-queue sysfs files. The issue is that they use the kobject default groups for this and rely entirely on the kobject refcounting for releasing their sysfs paths. In order to remove rtnl_trylock calls, we need sysfs files not to rely on their associated kobject refcounting for their release. Thus, we move the queue sysfs files from the kobject default groups to their own groups, which can be removed separately. Signed-off-by: Antoine Tenart Link: https://patch.msgid.link/20250204170314.146022-3-atenart@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2a59034a5fa2..1dcc76af7520 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -658,6 +658,7 @@ struct netdev_queue { struct Qdisc __rcu *qdisc_sleeping; #ifdef CONFIG_SYSFS struct kobject kobj; + const struct attribute_group **groups; #endif unsigned long tx_maxrate; /* -- cgit v1.2.3 From f9beaf4fac64c84631ba9a2eb864cea6b52032a2 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Feb 2025 23:35:06 +0200 Subject: net/mlx5: Change clock in mlx5_core_dev to mlx5_clock pointer Change the clock member in mlx5_core_dev to a pointer, so it can point to a clock shared by multiple functions in a later patch. For now, each function has its own clock, so mdev in mlx5_clock_priv is the back pointer to the function. Later it points to one (normally the first one) of the multiple functions sharing the same clock. Change mlx5_init_clock() to return an error if mlx5_clock is not allocated. In addition, a null clock is defined and used when the hardware clock is not supported, so the clock pointer always points to something valid.
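A sketch of the resulting invariant on the init path (mlx5_null_clock is a name assumed from the description above, and the capability check is illustrative only):

	static struct mlx5_clock mlx5_null_clock;

	if (!MLX5_CAP_GEN(mdev, device_frequency_khz)) {
		mdev->clock = &mlx5_null_clock;	/* valid, does nothing */
		return 0;
	}
	mdev->clock = kzalloc(sizeof(*mdev->clock), GFP_KERNEL);
	if (!mdev->clock)
		return -ENOMEM;

Callers can then dereference mdev->clock unconditionally instead of sprinkling NULL checks.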
Signed-off-by: Jianbo Liu Reviewed-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- include/linux/mlx5/driver.h | 31 ++----------------------------- 1 file changed, 2 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index af86097641b0..5dab3d8d05e4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -54,7 +54,6 @@ #include #include #include -#include #include #define MLX5_ADEV_NAME "mlx5_core" @@ -679,33 +678,7 @@ struct mlx5_rsvd_gids { struct ida ida; }; -#define MAX_PIN_NUM 8 -struct mlx5_pps { - u8 pin_caps[MAX_PIN_NUM]; - struct work_struct out_work; - u64 start[MAX_PIN_NUM]; - u8 enabled; - u64 min_npps_period; - u64 min_out_pulse_duration_ns; -}; - -struct mlx5_timer { - struct cyclecounter cycles; - struct timecounter tc; - u32 nominal_c_mult; - unsigned long overflow_period; -}; - -struct mlx5_clock { - struct mlx5_nb pps_nb; - seqlock_t lock; - struct hwtstamp_config hwtstamp_config; - struct ptp_clock *ptp; - struct ptp_clock_info ptp_info; - struct mlx5_pps pps_info; - struct mlx5_timer timer; -}; - +struct mlx5_clock; struct mlx5_dm; struct mlx5_fw_tracer; struct mlx5_vxlan; @@ -789,7 +762,7 @@ struct mlx5_core_dev { #ifdef CONFIG_MLX5_FPGA struct mlx5_fpga_device *fpga; #endif - struct mlx5_clock clock; + struct mlx5_clock *clock; struct mlx5_ib_clock_info *clock_info; struct mlx5_fw_tracer *tracer; struct mlx5_rsc_dump *rsc_dump; -- cgit v1.2.3 From 574998cf3b3f59afa9e3a6bbb609d9d4eb2023b4 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Feb 2025 23:35:07 +0200 Subject: net/mlx5: Add devcom component for the clock shared by functions Add a new devcom component for the hardware clock. When it is running in real time mode, the functions are grouped by the identity they query. According to the firmware documentation, the clock identity size is 64 bits, so it is safe to memcpy it to the component key, as the key size is also 64 bits. Signed-off-by: Jianbo Liu Reviewed-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5dab3d8d05e4..46bd7550adf8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -679,6 +679,7 @@ struct mlx5_rsvd_gids { }; struct mlx5_clock; +struct mlx5_clock_dev_state; struct mlx5_dm; struct mlx5_fw_tracer; struct mlx5_vxlan; @@ -763,6 +764,7 @@ struct mlx5_core_dev { struct mlx5_fpga_device *fpga; #endif struct mlx5_clock *clock; + struct mlx5_clock_dev_state *clock_state; struct mlx5_ib_clock_info *clock_info; struct mlx5_fw_tracer *tracer; struct mlx5_rsc_dump *rsc_dump; -- cgit v1.2.3 From ee0a4fc396f1b6fd1b34e99754896961fb67e4e3 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Feb 2025 23:35:12 +0200 Subject: net/mlx5: Add support for 200Gbps per lane link modes This patch exposes new link modes using 200Gbps per lane, including 200G, 400G and 800G modes.
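The lane counts are encoded in the enum names; for illustration, a hypothetical helper (not part of this patch) that classifies the new modes by per-lane rate:

	static int mlx5e_gbps_per_lane(enum mlx5e_ext_link_mode mode)
	{
		switch (mode) {
		case MLX5E_200GAUI_1_200GBASE_CR1_KR1:	/* 200G, 1 lane */
		case MLX5E_400GAUI_2_400GBASE_CR2_KR2:	/* 400G, 2 lanes */
		case MLX5E_800GAUI_4_800GBASE_CR4_KR4:	/* 800G, 4 lanes */
			return 200;
		default:
			return 0;	/* not a 200G-per-lane mode */
		}
	}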
Signed-off-by: Jianbo Liu Reviewed-by: Shahar Shitrit Signed-off-by: Tariq Toukan Signed-off-by: Paolo Abeni --- include/linux/mlx5/port.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index e68d42b8ce65..fd625e0dd869 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -115,9 +115,12 @@ enum mlx5e_ext_link_mode { MLX5E_100GAUI_1_100GBASE_CR_KR = 11, MLX5E_200GAUI_4_200GBASE_CR4_KR4 = 12, MLX5E_200GAUI_2_200GBASE_CR2_KR2 = 13, + MLX5E_200GAUI_1_200GBASE_CR1_KR1 = 14, MLX5E_400GAUI_8_400GBASE_CR8 = 15, MLX5E_400GAUI_4_400GBASE_CR4_KR4 = 16, + MLX5E_400GAUI_2_400GBASE_CR2_KR2 = 17, MLX5E_800GAUI_8_800GBASE_CR8_KR8 = 19, + MLX5E_800GAUI_4_800GBASE_CR4_KR4 = 20, MLX5E_EXT_LINK_MODES_NUMBER, }; -- cgit v1.2.3 From bdfa77e7c6bfda57d28fba24a5195fca2392c08a Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 5 Feb 2025 15:34:32 -0600 Subject: vfs: remove some unused old mount api code Remove reconfigure_single, mount_single, and compare_single now that no users remain. Signed-off-by: Eric Sandeen Link: https://lore.kernel.org/r/20250205213931.74614-5-sandeen@redhat.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 3 --- include/linux/fs_context.h | 2 -- 2 files changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index be3ad155ec9f..ff5e8ab9f951 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2641,9 +2641,6 @@ static inline bool is_mgtime(const struct inode *inode) extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); -extern struct dentry *mount_single(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 4b4bfef6f053..a19e4bd32e4d 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -144,8 +144,6 @@ extern void put_fs_context(struct fs_context *fc); extern int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param); extern void fc_drop_locked(struct fs_context *fc); -int reconfigure_single(struct super_block *s, - int flags, void *data); extern int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, -- cgit v1.2.3 From c50105933f0c75aacc4f95c9bf36f7fbd9a83884 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:39:59 +0100 Subject: iomap: allow the file system to submit the writeback bios Change ->prepare_ioend to ->submit_ioend and require file systems that implement it to submit the bio. This is needed for file systems that do their own work on the bios before submitting them to the block layer like btrfs or zoned xfs. To make this easier also pass the writeback context to the method. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-2-hch@lst.de Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 75bf54e76f3b..dc8df4f779d4 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -362,12 +362,14 @@ struct iomap_writeback_ops { loff_t offset, unsigned len); /* - * Optional, allows the file systems to perform actions just before - * submitting the bio and/or override the bio end_io handler for complex - * operations like copy on write extent manipulation or unwritten extent - * conversions. + * Optional, allows the file systems to hook into bio submission, + * including overriding the bi_end_io handler. + * + * Returns 0 if the bio was successfully submitted, or a negative + * error code if status was non-zero or another error happened and + * the bio could not be submitted. */ - int (*prepare_ioend)(struct iomap_ioend *ioend, int status); + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status); /* * Optional, allows the file system to discard state on a page where -- cgit v1.2.3 From 710273330663241d9ca5fbed51909e65807556ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:00 +0100 Subject: iomap: simplify io_flags and io_type in struct iomap_ioend The ioend fields for distinct types of I/O are a bit complicated. Consolidate them into a single io_flags field with its own flags decoupled from the iomap flags. This also prepares for adding a new flag that is unrelated to both of the iomap namespaces. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-3-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index dc8df4f779d4..9583f6456165 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -327,13 +327,29 @@ loff_t iomap_seek_data(struct inode *inode, loff_t offset, sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops); +/* + * Flags for iomap_ioend->io_flags. + */ +/* shared COW extent */ +#define IOMAP_IOEND_SHARED (1U << 0) +/* unwritten extent */ +#define IOMAP_IOEND_UNWRITTEN (1U << 1) +/* don't merge into previous ioend */ +#define IOMAP_IOEND_BOUNDARY (1U << 2) + +/* + * Flags that if set on either ioend prevent the merge of two ioends. + * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) + */ +#define IOMAP_IOEND_NOMERGE_FLAGS \ + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN) + /* * Structure for writeback I/O completions. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ - u16 io_type; - u16 io_flags; /* IOMAP_F_* */ + u16 io_flags; /* IOMAP_IOEND_* */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of data within eof */ loff_t io_offset; /* offset in the file */ -- cgit v1.2.3 From 034c29fb3e7c119c42e650986e280f025a1bec7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:01 +0100 Subject: iomap: add a IOMAP_F_ANON_WRITE flag Add an IOMAP_F_ANON_WRITE flag that indicates that the write I/O does not have a target block assigned to it yet at iomap time and the file system will do that in the bio submission handler, splitting the I/O as needed.
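As a sketch, an ->iomap_begin instance would hand back such a mapping roughly like this (values illustrative):

	iomap->type = IOMAP_MAPPED;
	iomap->flags = IOMAP_F_ANON_WRITE;
	iomap->addr = IOMAP_NULL_ADDR;	/* no block assigned yet */

and, as the hunk below shows, iomap_sector() then returns U64_MAX for such mappings, so nothing consumes the address before the submission handler fills it in.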
This is used to implement Zone Append based I/O for zoned XFS, where splitting writes to the hardware limits and assigning a zone to them happens just before sending the I/O off to the block layer, but could also be useful for other things like compressed I/O. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-4-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 9583f6456165..eb0764945b42 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -56,6 +56,10 @@ struct vm_fault; * * IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must * never be merged with the mapping before it. + * + * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block + * assigned to it yet and the file system will do that in the bio submission + * handler, splitting the I/O as needed. */ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -68,6 +72,7 @@ struct vm_fault; #endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) #define IOMAP_F_BOUNDARY (1U << 6) +#define IOMAP_F_ANON_WRITE (1U << 7) /* * Flags set by the core iomap code during operations: @@ -111,6 +116,8 @@ struct iomap { static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) { + if (iomap->flags & IOMAP_F_ANON_WRITE) + return U64_MAX; /* invalid */ return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; } -- cgit v1.2.3 From 5fcbd555d48390a8c819ba7fdf55fbfcabe05c80 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:02 +0100 Subject: iomap: split bios to zone append limits in the submission handlers Provide helpers for file systems to split bios in the direct I/O and writeback I/O submission handlers. The split ioends are chained to the parent ioend so that only the parent ioend originally generated by the iomap layer will be processed after all the chained-off children have completed. This is based on the block layer bio chaining that has supported a similar mechanism for a long time. This follows btrfs' lead and doesn't try to build bios to hardware limits for zone append commands, but instead builds them as normal unconstrained bios and splits them to the hardware limits in the I/O submission handler. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-5-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index eb0764945b42..90c27875e39d 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -353,12 +353,19 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, /* * Structure for writeback I/O completions. + * + * File systems implementing ->submit_ioend can split a bio generated + * by iomap. In that case the parent ioend it was split from is recorded + * in ioend->io_parent.
*/ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_flags; /* IOMAP_IOEND_* */ struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of data within eof */ + size_t io_size; /* size of the extent */ + atomic_t io_remaining; /* completion defer count */ + int io_error; /* stashed away status */ + struct iomap_ioend *io_parent; /* parent for completions */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ struct bio io_bio; /* MUST BE LAST! */ @@ -408,6 +415,10 @@ struct iomap_writepage_ctx { u32 nr_folios; /* folios added to the ioend */ }; +struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio, + loff_t file_offset, u16 ioend_flags); +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append); void iomap_finish_ioends(struct iomap_ioend *ioend, int error); void iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends); @@ -479,4 +490,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, # define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO) #endif /* CONFIG_SWAP */ +extern struct bio_set iomap_ioend_bioset; + #endif /* LINUX_IOMAP_H */ -- cgit v1.2.3 From e523f2d4c974a819730830ce1c38834ee0cd7318 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:05 +0100 Subject: iomap: optionally use ioends for direct I/O struct iomap_ioend currently tracks outstanding buffered writes and has some really nice code in core iomap and XFS to merge contiguous I/Os and defer them to userspace for completion in a very efficient way. For zoned writes we'll also need a per-bio user context completion to record the written blocks, and the infrastructure for that would look basically like the ioend handling for buffered I/O. So instead of reinventing the wheel, reuse the existing infrastructure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-8-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 90c27875e39d..5768b9f2a1cc 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -343,20 +343,22 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, #define IOMAP_IOEND_UNWRITTEN (1U << 1) /* don't merge into previous ioend */ #define IOMAP_IOEND_BOUNDARY (1U << 2) +/* is direct I/O */ +#define IOMAP_IOEND_DIRECT (1U << 3) /* * Flags that if set on either ioend prevent the merge of two ioends. * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) */ #define IOMAP_IOEND_NOMERGE_FLAGS \ - (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN) + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT) /* * Structure for writeback I/O completions. * - * File systems implementing ->submit_ioend can split a bio generated - * by iomap. In that case the parent ioend it was split from is recorded - * in ioend->io_parent. + * File systems implementing ->submit_ioend (for buffered I/O) or ->submit_io + * (for direct I/O) can split a bio generated by iomap. In that case the parent + * ioend it was split from is recorded in ioend->io_parent. 
*/ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ -- cgit v1.2.3 From d06244c60aec1d5d1589efe6b611a5b91a49465c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:06 +0100 Subject: iomap: add a io_private field to struct iomap_ioend Add a private data field to struct iomap_ioend so that the file system can attach information to it. Zoned XFS will use this for a pointer to the open zone. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-9-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/iomap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 5768b9f2a1cc..b4be07e8ec94 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -370,6 +370,7 @@ struct iomap_ioend { struct iomap_ioend *io_parent; /* parent for completions */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ + void *io_private; /* file system private data */ struct bio io_bio; /* MUST BE LAST! */ }; -- cgit v1.2.3 From 02b39c4655d52141e07e80e9b2772d96daf67ff6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:07 +0100 Subject: iomap: pass private data to iomap_page_mkwrite Allow the file system to pass private data which can be used by the iomap_begin and iomap_end methods through the private pointer in the iomap_iter structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-10-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b4be07e8ec94..d528eb4d5cfe 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -316,9 +316,8 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, - const struct iomap_ops *ops); - +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private); typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, struct iomap *iomap); void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, -- cgit v1.2.3 From c6d1b8d15450cf061648d4e36d622da9d755654a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:08 +0100 Subject: iomap: pass private data to iomap_zero_range Allow the file system to pass private data which can be used by the iomap_begin and iomap_end methods through the private pointer in the iomap_iter structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-11-hch@lst.de Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index d528eb4d5cfe..eddf524ac749 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -313,7 +313,7 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, - bool *did_zero, const struct iomap_ops *ops); + bool *did_zero, const struct iomap_ops *ops, void *private); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, -- cgit v1.2.3 From ddd402bbbf669c4ada106fd2e4c799e2b5745e3e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:40:09 +0100 Subject: iomap: pass private data to iomap_truncate_page Allow the file system to pass private data which can be used by the iomap_begin and iomap_end methods through the private pointer in the iomap_iter structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250206064035.2323428-12-hch@lst.de Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index eddf524ac749..022d7f338c68 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -315,7 +315,7 @@ int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, void *private); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops); + const struct iomap_ops *ops, void *private); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, void *private); typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, -- cgit v1.2.3 From 958c3a6706863f9f77b21c90d2428473441cd8a1 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Thu, 23 Jan 2025 08:44:17 +0000 Subject: efi/cper, cxl: Make definitions and structures global In preparation to add tracepoint support, move protocol error UUID definition to a common location, Also, make struct CXL RAS capability, cxl_cper_sec_prot_err and CPER validation flags global for use across different modules. 
Signed-off-by: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Fan Ni Reviewed-by: Gregory Price Reviewed-by: Dan Williams Link: https://patch.msgid.link/20250123084421.127697-3-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang --- include/linux/cper.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cper.h b/include/linux/cper.h index 265b0f8fc0b3..5c6d4d5b9975 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -89,6 +89,10 @@ enum { #define CPER_NOTIFY_DMAR \ GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \ 0x72, 0x2D, 0xEB, 0x41) +/* CXL Protocol Error Section */ +#define CPER_SEC_CXL_PROT_ERR \ + GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \ + 0x4B, 0x77, 0x10, 0x48) /* CXL Event record UUIDs are formatted as GUIDs and reported in section type */ /* -- cgit v1.2.3 From 61eac5f7f6439e8fe99b5fb29406acb0fd7b83c6 Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Thu, 23 Jan 2025 08:44:18 +0000 Subject: efi/cper, cxl: Remove cper_cxl.h Move the declaration of cxl_cper_print_prot_err() to include/linux/cper.h to avoid maintaining a separate header file just for this function declaration. Remove drivers/firmware/efi/cper_cxl.h as its contents have been reorganized. No functional changes. Signed-off-by: Smita Koralahalli Reviewed-by: Ira Weiny Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Fan Ni Reviewed-by: Gregory Price Reviewed-by: Dan Williams Link: https://patch.msgid.link/20250123084421.127697-4-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang --- include/linux/cper.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cper.h b/include/linux/cper.h index 5c6d4d5b9975..0ed60a91eca9 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -605,4 +605,8 @@ void cper_estatus_print(const char *pfx, int cper_estatus_check_header(const struct acpi_hest_generic_status *estatus); int cper_estatus_check(const struct acpi_hest_generic_status *estatus); +struct cxl_cper_sec_prot_err; +void cxl_cper_print_prot_err(const char *pfx, + const struct cxl_cper_sec_prot_err *prot_err); + #endif -- cgit v1.2.3 From 486729c6012042c486db2a5e4d5dd034fb1d3f3c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:40:54 +0530 Subject: cpufreq: Remove cpufreq_generic_attrs All users of cpufreq_generic_attr are migrated now, remove it. While at it, also stop exporting attributes for available and boost frequencies as they are only used by cpufreq core now. Signed-off-by: Viresh Kumar Acked-by: Rafael J. 
Wysocki --- include/linux/cpufreq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 7fe0981a7e46..d237ef91d1f1 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -1198,7 +1198,6 @@ void arch_set_freq_scale(const struct cpumask *cpus, /* the following are really really optional */ extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs; extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs; -extern struct freq_attr *cpufreq_generic_attr[]; int cpufreq_table_validate_and_sort(struct cpufreq_policy *policy); unsigned int cpufreq_generic_get(unsigned int cpu); -- cgit v1.2.3 From 1f04815057a4c1ca557448b56ad5ab536978540e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 10:48:34 +0530 Subject: cpufreq: staticize cpufreq_boost_trigger_state() cpufreq_boost_trigger_state() is only used by cpufreq core, mark it static. Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index d237ef91d1f1..0e708830d30d 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -778,7 +778,6 @@ int cpufreq_frequency_table_get_index(struct cpufreq_policy *policy, ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf); #ifdef CONFIG_CPU_FREQ -int cpufreq_boost_trigger_state(int state); bool cpufreq_boost_enabled(void); int cpufreq_enable_boost_support(void); bool policy_has_boost_freq(struct cpufreq_policy *policy); @@ -1150,10 +1149,6 @@ static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_ return 0; } #else -static inline int cpufreq_boost_trigger_state(int state) -{ - return 0; -} static inline bool cpufreq_boost_enabled(void) { return false; -- cgit v1.2.3 From 9a23eb8b2b5d8c5f1129c5a523a786ddd53cd7c9 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 13:09:35 +0530 Subject: cpufreq: Export cpufreq_boost_set_sw() This will be used directly by cpufreq driver going forward, export it. Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 0e708830d30d..c7d1fe5ebf7a 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -781,6 +781,7 @@ ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf); bool cpufreq_boost_enabled(void); int cpufreq_enable_boost_support(void); bool policy_has_boost_freq(struct cpufreq_policy *policy); +int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state); /* Find lowest freq at or above target in a table in ascending order */ static inline int cpufreq_table_find_index_al(struct cpufreq_policy *policy, @@ -1164,6 +1165,11 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy) return false; } +static inline int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) +{ + return -EOPNOTSUPP; +} + static inline int cpufreq_table_set_inefficient(struct cpufreq_policy *policy, unsigned int frequency) -- cgit v1.2.3 From 1f7d1bab50e6ae517f8b6699e56d709d61ae13e5 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 11:04:25 +0530 Subject: cpufreq: Introduce policy->boost_supported flag It is possible to have a scenario where not all cpufreq policies support boost frequencies. 
And letting sysfs (or other parts of the kernel) enable boost feature for that policy isn't correct. Add a new flag, boost_supported, which will be set to true by the cpufreq core only if the freq table contains valid boost frequencies. Some cpufreq drivers though don't have boost frequencies in the freq-table, they can set this flag from their ->init() callbacks. Once all the drivers are updated to set the flag correctly, we can check it before enabling boost feature for a policy. Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index c7d1fe5ebf7a..b017af4398b9 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -144,6 +144,9 @@ struct cpufreq_policy { /* Per policy boost enabled flag. */ bool boost_enabled; + /* Per policy boost supported flag. */ + bool boost_supported; + /* Cached frequency lookup from cpufreq_driver_resolve_freq. */ unsigned int cached_target_freq; unsigned int cached_resolved_idx; -- cgit v1.2.3 From c952775a3d72b0505c2f8f4171929c293479f0fd Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 14:27:11 +0530 Subject: cpufreq: staticize policy_has_boost_freq() policy_has_boost_freq() isn't used outside of freq_table.c now, mark it static. Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index b017af4398b9..466d186166da 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -783,7 +783,6 @@ ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf); #ifdef CONFIG_CPU_FREQ bool cpufreq_boost_enabled(void); int cpufreq_enable_boost_support(void); -bool policy_has_boost_freq(struct cpufreq_policy *policy); int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state); /* Find lowest freq at or above target in a table in ascending order */ @@ -1163,11 +1162,6 @@ static inline int cpufreq_enable_boost_support(void) return -EINVAL; } -static inline bool policy_has_boost_freq(struct cpufreq_policy *policy) -{ - return false; -} - static inline int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) { return -EOPNOTSUPP; -- cgit v1.2.3 From 0322f3e89b4eb32702321687a7636ee5290fe255 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 14:26:20 +0530 Subject: cpufreq: Remove cpufreq_enable_boost_support() Remove the now unused helper, cpufreq_enable_boost_support(). 
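For illustration, the boost_supported flag introduced just above is meant to be set from a driver's ->init() callback when its boost frequencies are not described in the frequency table; a minimal, hypothetical sketch (driver name invented):

	static int example_cpufreq_init(struct cpufreq_policy *policy)
	{
		/* ... usual setup: frequency table, driver data, etc. ... */

		/* Boost frequencies are managed by firmware and are not in
		 * the freq table, so advertise boost support by hand. */
		policy->boost_supported = true;
		return 0;
	}
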
Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 466d186166da..cefd853abfd1 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -782,7 +782,6 @@ ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf); #ifdef CONFIG_CPU_FREQ bool cpufreq_boost_enabled(void); -int cpufreq_enable_boost_support(void); int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state); /* Find lowest freq at or above target in a table in ascending order */ @@ -1157,11 +1156,6 @@ static inline bool cpufreq_boost_enabled(void) return false; } -static inline int cpufreq_enable_boost_support(void) -{ - return -EINVAL; -} - static inline int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) { return -EOPNOTSUPP; -- cgit v1.2.3 From 7903f907a226058ed99f86e9924e082aea57fc45 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 6 Feb 2025 17:44:13 +0100 Subject: pid: perform free_pid() calls outside of tasklist_lock As the clone side already executes pid allocation with only pidmap_lock held, issuing free_pid() while still holding tasklist_lock exacerbates total hold time of the latter. More things may show up later which require initial clean-up with the lock held but can finish without it. For that reason a struct to collect such work is added instead of merely passing the pid array. Reviewed-by: Oleg Nesterov Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250206164415.450051-5-mjguzik@gmail.com Acked-by: "Liam R. Howlett" Signed-off-by: Christian Brauner --- include/linux/pid.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pid.h b/include/linux/pid.h index 98837a1ff0f3..311ecebd7d56 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -101,9 +101,9 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type); * these helpers must be called with the tasklist_lock write-held. */ extern void attach_pid(struct task_struct *task, enum pid_type); -extern void detach_pid(struct task_struct *task, enum pid_type); -extern void change_pid(struct task_struct *task, enum pid_type, - struct pid *pid); +void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type); +void change_pid(struct pid **pids, struct task_struct *task, enum pid_type, + struct pid *pid); extern void exchange_tids(struct task_struct *task, struct task_struct *old); extern void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type); @@ -129,6 +129,7 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *); extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size); extern void free_pid(struct pid *pid); +void free_pids(struct pid **pids); extern void disable_pid_allocation(struct pid_namespace *ns); /* -- cgit v1.2.3 From 8d3bbe4355aded32961b9009b31de6d41b7352e9 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Wed, 5 Feb 2025 12:42:21 +0000 Subject: of: base: Add of_get_available_child_by_name() There are a lot of drivers using of_get_child_by_name() followed by of_device_is_available() to find the available child node by name for a given parent. Provide a helper for these users to simplify the code. Suggested-by: Geert Uytterhoeven Reviewed-by: Rob Herring Signed-off-by: Biju Das Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/linux/of.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index eaf0e2a2b75c..9d6b8a61607f 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -301,6 +301,8 @@ extern struct device_node *of_get_compatible_child(const struct device_node *par const char *compatible); extern struct device_node *of_get_child_by_name(const struct device_node *node, const char *name); +extern struct device_node *of_get_available_child_by_name(const struct device_node *node, + const char *name); /* cache lookup */ extern struct device_node *of_find_next_cache_node(const struct device_node *); @@ -578,6 +580,13 @@ static inline struct device_node *of_get_child_by_name( return NULL; } +static inline struct device_node *of_get_available_child_by_name( + const struct device_node *node, + const char *name) +{ + return NULL; +} + static inline int of_device_is_compatible(const struct device_node *device, const char *name) { -- cgit v1.2.3 From c600a55922640b1c4dcfdc5a694cadd2dd9d1599 Mon Sep 17 00:00:00 2001 From: Tatsuya S Date: Mon, 6 Jan 2025 16:49:11 +0900 Subject: HID: core: Add reserved item tag for main items For main items, separate the warning for reserved item tags from the warning for unknown item tags. This comes from section 6.2.2.4 (Main Items) of the Device Class Definition for HID 1.11 specification. Signed-off-by: Tatsuya S Signed-off-by: Jiri Kosina --- include/linux/hid.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index cdc0dc13c87f..3f01cd77f6d4 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -81,6 +81,8 @@ struct hid_item { #define HID_MAIN_ITEM_TAG_FEATURE 11 #define HID_MAIN_ITEM_TAG_BEGIN_COLLECTION 10 #define HID_MAIN_ITEM_TAG_END_COLLECTION 12 +#define HID_MAIN_ITEM_TAG_RESERVED_MIN 13 +#define HID_MAIN_ITEM_TAG_RESERVED_MAX 15 /* * HID report descriptor main item contents -- cgit v1.2.3 From b8974b89603c601f64b302d80129b0faf4879199 Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:36 +0800 Subject: mm: vmstat: move sysctls to mm/vmstat.c This moves all vmstat-related sysctls into their own file, removes useless extern variable declarations, and does some related clean-ups. To avoid compiler warnings when CONFIG_PROC_FS is not defined, add the CONFIG_PROC_FS guard ahead of CONFIG_NUMA in vmstat.c. 
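The move follows the usual pattern of a file-local table registered from the owning file; a hedged sketch of what this roughly looks like in mm/vmstat.c (entry and init function are illustrative, not a verbatim copy of the patch):

	static struct ctl_table vmstat_table[] = {
		{
			.procname	= "stat_interval",
			.data		= &sysctl_stat_interval,
			.maxlen		= sizeof(sysctl_stat_interval),
			.mode		= 0644,
			.proc_handler	= proc_dointvec_jiffies,
		},
		/* stat_refresh, numa_stat, ... with their now-static handlers */
	};

	static int __init vmstat_sysctl_init(void)
	{
		register_sysctl_init("vm", vmstat_table);
		return 0;
	}
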
Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- include/linux/vmstat.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 9f3a04345b86..4751e3ecc467 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -10,15 +10,8 @@ #include #include -extern int sysctl_stat_interval; - #ifdef CONFIG_NUMA -#define ENABLE_NUMA_STAT 1 -#define DISABLE_NUMA_STAT 0 -extern int sysctl_vm_numa_stat; DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); -int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos); #endif struct reclaim_stat { @@ -304,10 +297,6 @@ void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); -struct ctl_table; -int vmstat_refresh(const struct ctl_table *, int write, void *buffer, size_t *lenp, - loff_t *ppos); - void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *); int calculate_pressure_threshold(struct zone *zone); -- cgit v1.2.3 From 73aa354af21d4c09b31e9b287f4cb32a9b4e9c8c Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:37 +0800 Subject: mm: filemap: move sysctl to mm/filemap.c This moves the filemap-related sysctl to mm/filemap.c, and removes the redundant external variable declaration. Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b1068ddcbb7..70cca450a68a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -40,8 +40,6 @@ struct user_struct; struct pt_regs; struct folio_batch; -extern int sysctl_page_lock_unfairness; - void mm_core_init(void); void init_mm_internals(void); -- cgit v1.2.3 From 7e05627ee17bf30422705f94627cf9da70683887 Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:38 +0800 Subject: mm: swap: move sysctl to mm/swap.c The page-cluster sysctl belongs to mm/swap.c, so move it there. Remove the redundant external variable declaration and the unneeded include of linux/swap.h. Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 70cca450a68a..23cfaa40fef3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -76,8 +76,6 @@ static inline void totalram_pages_add(long count) } extern void * high_memory; -extern int page_cluster; -extern const int page_cluster_max; #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; -- cgit v1.2.3 From 538d5baacd8a01b814777af1c9f1f9740a714707 Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:39 +0800 Subject: mm: vmscan: move vmscan sysctls to mm/vmscan.c This moves vm_swappiness and zone_reclaim_mode to mm/vmscan.c as part of the kernel/sysctl.c clean-up, and also moves some external variable declarations and function declarations from include/linux/swap.h into mm/internal.h. 
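After the move, callers inside mm/ pick the node reclaim helpers up from mm/internal.h instead of linux/swap.h; a brief, illustrative sketch of an unchanged call site:

	#include "internal.h"	/* now declares the node_reclaim_mode helpers */

	static bool example_can_node_reclaim(void)
	{
		/* true if any of RECLAIM_ZONE/RECLAIM_WRITE/RECLAIM_UNMAP is set */
		return node_reclaim_enabled();
	}
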
Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- include/linux/swap.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index b13b72645db3..a98c757400fe 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -433,19 +433,10 @@ extern int vm_swappiness; long remove_mapping(struct address_space *mapping, struct folio *folio); #ifdef CONFIG_NUMA -extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; extern int sysctl_min_slab_ratio; -#else -#define node_reclaim_mode 0 #endif -static inline bool node_reclaim_enabled(void) -{ - /* Is any node_reclaim_mode bit set? */ - return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP); -} - void check_move_unevictable_folios(struct folio_batch *fbatch); extern void __meminit kswapd_run(int nid); -- cgit v1.2.3 From b1e8d7134eb61677c8a3207004d1ce34305d1f35 Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:40 +0800 Subject: mm: util: move sysctls to mm/util.c This moves all util-related sysctls to mm/util.c as part of the kernel/sysctl.c clean-up, and also removes redundant external variable declarations and function declarations. Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- include/linux/mm.h | 11 ----------- include/linux/mman.h | 2 -- 2 files changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 23cfaa40fef3..7e0018222f64 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -205,17 +205,6 @@ extern int sysctl_max_map_count; extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; -extern int sysctl_overcommit_memory; -extern int sysctl_overcommit_ratio; -extern unsigned long sysctl_overcommit_kbytes; - -int overcommit_ratio_handler(const struct ctl_table *, int, void *, size_t *, - loff_t *); -int overcommit_kbytes_handler(const struct ctl_table *, int, void *, size_t *, - loff_t *); -int overcommit_policy_handler(const struct ctl_table *, int, void *, size_t *, - loff_t *); - #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) diff --git a/include/linux/mman.h b/include/linux/mman.h index a842783ffa62..bce214fece16 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -59,8 +59,6 @@ | MAP_HUGE_1GB) extern int sysctl_overcommit_memory; -extern int sysctl_overcommit_ratio; -extern unsigned long sysctl_overcommit_kbytes; extern struct percpu_counter vm_committed_as; #ifdef CONFIG_SMP -- cgit v1.2.3 From 97f5420ef1f4cc5e48ac90f5496039b31cf74036 Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:43 +0800 Subject: mm: nommu: move sysctl to mm/nommu.c The sysctl_nr_trim_pages sysctl belongs to mm/nommu.c, so move it there from kernel/sysctl.c. 
And remove the useless extern variable declaration from include/linux/mm.h. Signed-off-by: Kaixiong Yu Reviewed-by: Lorenzo Stoakes Signed-off-by: Joel Granados --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7e0018222f64..1c470e505bd4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4066,8 +4066,6 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr); #endif -extern int sysctl_nr_trim_pages; - #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, -- cgit v1.2.3 From 94eed61d5877c39b9dcdb9079aa6e0907aeb5f7d Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:44 +0800 Subject: fs: fs-writeback: move sysctl to fs/fs-writeback.c The dirtytime_expire_interval sysctl belongs to fs/fs-writeback.c, so move it there from kernel/sysctl.c. And remove the useless extern variable declaration and the function declaration from include/linux/writeback.h. Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- include/linux/writeback.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d11b903c2edb..caf4f0b12235 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -327,12 +327,8 @@ extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; -extern unsigned int dirtytime_expire_interval; extern int laptop_mode; -int dirtytime_interval_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); - void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); unsigned long cgwb_calc_thresh(struct bdi_writeback *wb); -- cgit v1.2.3 From f5d64ae331d00c27cc6fc615811a9ba13de5fb0e Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:45 +0800 Subject: fs: drop_caches: move sysctl to fs/drop_caches.c The sysctl_drop_caches sysctl belongs to fs/drop_caches.c, so move it there from kernel/sysctl.c. And remove the useless extern variable declaration from include/linux/mm.h. Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- include/linux/mm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1c470e505bd4..f5ba3ed8b44a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3789,12 +3789,6 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); -#ifdef CONFIG_SYSCTL -extern int sysctl_drop_caches; -int drop_caches_sysctl_handler(const struct ctl_table *, int, void *, size_t *, - loff_t *); -#endif - void drop_slab(void); #ifndef CONFIG_MMU -- cgit v1.2.3 From 52e66823e0bea9adbe3e16075b5d76867ca695aa Mon Sep 17 00:00:00 2001 From: Kaixiong Yu Date: Sat, 11 Jan 2025 15:07:47 +0800 Subject: fs: dcache: move the sysctl to fs/dcache.c The sysctl_vfs_cache_pressure sysctl belongs to fs/dcache.c, so move it there from kernel/sysctl.c. 
As a part of the fs/dcache.c clean-up, sysctl_vfs_cache_pressure is changed to a static variable, the inline function vfs_pressure_ratio() is changed to an out-of-line function, and vfs_pressure_ratio() is exported with EXPORT_SYMBOL_GPL so it can be used by other files. Also move the now-unneeded include (linux/dcache.h). Signed-off-by: Kaixiong Yu Reviewed-by: Kees Cook Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Reviewed-by: Jeff Layton Signed-off-by: Joel Granados --- include/linux/dcache.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4afb60365675..f861e709b385 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -520,12 +520,7 @@ static inline int simple_positive(const struct dentry *dentry) return d_really_is_positive(dentry) && !d_unhashed(dentry); } -extern int sysctl_vfs_cache_pressure; - -static inline unsigned long vfs_pressure_ratio(unsigned long val) -{ - return mult_frac(val, sysctl_vfs_cache_pressure, 100); -} +unsigned long vfs_pressure_ratio(unsigned long val); /** * d_inode - Get the actual inode of this dentry -- cgit v1.2.3 From c8006fbd0f4fd15fa990786ff9a097b45eef616c Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 27 Jan 2025 21:58:59 +0000 Subject: bus: mhi: host: Remove unused functions mhi_device_get() and mhi_queue_dma() haven't been used since 2020's commit 189ff97cca53 ("bus: mhi: core: Add support for data transfer") added them. Remove them. Note that mhi_queue_dma_sync() is used and has been left. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20250127215859.261105-1-linux@treblig.org Signed-off-by: Manivannan Sadhasivam --- include/linux/mhi.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 059dc94d20bb..dd372b0123a6 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -720,12 +720,6 @@ enum mhi_state mhi_get_mhi_state(struct mhi_controller *mhi_cntrl); */ void mhi_soc_reset(struct mhi_controller *mhi_cntrl); -/** - * mhi_device_get - Disable device low power mode - * @mhi_dev: Device associated with the channel - */ -void mhi_device_get(struct mhi_device *mhi_dev); - /** * mhi_device_get_sync - Disable device low power mode. Synchronously * take the controller out of suspended state @@ -776,18 +770,6 @@ int mhi_prepare_for_transfer_autoqueue(struct mhi_device *mhi_dev); */ void mhi_unprepare_from_transfer(struct mhi_device *mhi_dev); -/** - * mhi_queue_dma - Send or receive DMA mapped buffers from client device - * over MHI channel - * @mhi_dev: Device associated with the channels - * @dir: DMA direction for the channel - * @mhi_buf: Buffer for holding the DMA mapped data - * @len: Buffer length - * @mflags: MHI transfer flags used for the transfer - */ -int mhi_queue_dma(struct mhi_device *mhi_dev, enum dma_data_direction dir, - struct mhi_buf *mhi_buf, size_t len, enum mhi_flags mflags); - /** * mhi_queue_buf - Send or receive raw buffers from client device over MHI * channel -- cgit v1.2.3 From 8e02d188698851436f76038ea998b726193d1b10 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 7 Feb 2025 14:08:58 -0600 Subject: spi: add basic support for SPI offloading Add the basic infrastructure to support SPI offload providers and consumers. SPI offloading is a feature that allows the SPI controller to perform transfers without any CPU intervention. This is useful, e.g. 
for high-speed data acquisition. SPI controllers with offload support need to implement the get_offload and put_offload callbacks and can use the devm_spi_offload_alloc() to allocate offload instances. SPI peripheral drivers will call devm_spi_offload_get() to get a reference to the matching offload instance. This offload instance can then be attached to a SPI message to request offloading that message. It is expected that SPI controllers with offload support will check for the offload instance in the SPI message in the ctlr->optimize_message() callback and handle it accordingly. CONFIG_SPI_OFFLOAD is intended to be a select-only option. Both consumer and provider drivers should `select SPI_OFFLOAD` in their Kconfig to ensure that the SPI core is built with offload support. Reviewed-by: Jonathan Cameron Reviewed-by: Nuno Sa Signed-off-by: David Lechner Link: https://patch.msgid.link/20250207-dlech-mainline-spi-engine-offload-2-v8-1-e48a489be48c@baylibre.com Signed-off-by: Mark Brown --- include/linux/spi/offload/consumer.h | 22 ++++++++++++++++++ include/linux/spi/offload/provider.h | 19 ++++++++++++++++ include/linux/spi/offload/types.h | 43 ++++++++++++++++++++++++++++++++++++ include/linux/spi/spi.h | 17 ++++++++++++++ 4 files changed, 101 insertions(+) create mode 100644 include/linux/spi/offload/consumer.h create mode 100644 include/linux/spi/offload/provider.h create mode 100644 include/linux/spi/offload/types.h (limited to 'include/linux') diff --git a/include/linux/spi/offload/consumer.h b/include/linux/spi/offload/consumer.h new file mode 100644 index 000000000000..05543dbedf30 --- /dev/null +++ b/include/linux/spi/offload/consumer.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024 Analog Devices Inc. + * Copyright (C) 2024 BayLibre, SAS + */ + +#ifndef __LINUX_SPI_OFFLOAD_CONSUMER_H +#define __LINUX_SPI_OFFLOAD_CONSUMER_H + +#include +#include +#include + +MODULE_IMPORT_NS("SPI_OFFLOAD"); + +struct device; +struct spi_device; + +struct spi_offload *devm_spi_offload_get(struct device *dev, struct spi_device *spi, + const struct spi_offload_config *config); + +#endif /* __LINUX_SPI_OFFLOAD_CONSUMER_H */ diff --git a/include/linux/spi/offload/provider.h b/include/linux/spi/offload/provider.h new file mode 100644 index 000000000000..278c4edfcdb7 --- /dev/null +++ b/include/linux/spi/offload/provider.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024 Analog Devices Inc. + * Copyright (C) 2024 BayLibre, SAS + */ + +#ifndef __LINUX_SPI_OFFLOAD_PROVIDER_H +#define __LINUX_SPI_OFFLOAD_PROVIDER_H + +#include +#include + +MODULE_IMPORT_NS("SPI_OFFLOAD"); + +struct device; + +struct spi_offload *devm_spi_offload_alloc(struct device *dev, size_t priv_size); + +#endif /* __LINUX_SPI_OFFLOAD_PROVIDER_H */ diff --git a/include/linux/spi/offload/types.h b/include/linux/spi/offload/types.h new file mode 100644 index 000000000000..a74f8d84541b --- /dev/null +++ b/include/linux/spi/offload/types.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024 Analog Devices Inc. + * Copyright (C) 2024 BayLibre, SAS + */ + +#ifndef __LINUX_SPI_OFFLOAD_TYPES_H +#define __LINUX_SPI_OFFLOAD_TYPES_H + +#include + +struct device; + +/* Offload can be triggered by external hardware event. */ +#define SPI_OFFLOAD_CAP_TRIGGER BIT(0) +/* Offload can record and then play back TX data when triggered. 
*/ +#define SPI_OFFLOAD_CAP_TX_STATIC_DATA BIT(1) +/* Offload can get TX data from an external stream source. */ +#define SPI_OFFLOAD_CAP_TX_STREAM_DMA BIT(2) +/* Offload can send RX data to an external stream sink. */ +#define SPI_OFFLOAD_CAP_RX_STREAM_DMA BIT(3) + +/** + * struct spi_offload_config - offload configuration + * + * This is used to request an offload with specific configuration. + */ +struct spi_offload_config { + /** @capability_flags: required capabilities. See %SPI_OFFLOAD_CAP_* */ + u32 capability_flags; +}; + +/** + * struct spi_offload - offload instance + */ +struct spi_offload { + /** @provider_dev: for get/put reference counting */ + struct device *provider_dev; + /** @priv: provider driver private data */ + void *priv; +}; + +#endif /* __LINUX_SPI_OFFLOAD_TYPES_H */ diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 8497f4747e24..98bdc8c16c20 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -31,6 +31,8 @@ struct spi_transfer; struct spi_controller_mem_ops; struct spi_controller_mem_caps; struct spi_message; +struct spi_offload; +struct spi_offload_config; /* * INTERFACES between SPI master-side drivers and SPI slave protocol handlers, @@ -496,6 +498,10 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * @mem_ops: optimized/dedicated operations for interactions with SPI memory. * This field is optional and should only be implemented if the * controller has native support for memory like operations. + * @get_offload: callback for controllers with offload support to get matching + * offload instance. Implementations should return -ENODEV if no match is + * found. + * @put_offload: release the offload instance acquired by @get_offload. * @mem_caps: controller capabilities for the handling of memory operations. * @unprepare_message: undo any work done by prepare_message(). * @target_abort: abort the ongoing transfer request on an SPI target controller @@ -740,6 +746,10 @@ struct spi_controller { const struct spi_controller_mem_ops *mem_ops; const struct spi_controller_mem_caps *mem_caps; + struct spi_offload *(*get_offload)(struct spi_device *spi, + const struct spi_offload_config *config); + void (*put_offload)(struct spi_offload *offload); + /* GPIO chip select */ struct gpio_desc **cs_gpiods; bool use_gpio_descriptors; @@ -1108,6 +1118,7 @@ struct spi_transfer { * @state: for use by whichever driver currently owns the message * @opt_state: for use by whichever driver currently owns the message * @resources: for resource management when the SPI message is processed + * @offload: (optional) offload instance used by this message * * A @spi_message is used to execute an atomic sequence of data transfers, * each represented by a struct spi_transfer. The sequence is "atomic" @@ -1168,6 +1179,12 @@ struct spi_message { */ void *opt_state; + /* + * Optional offload instance used by this message. This must be set + * by the peripheral driver before calling spi_optimize_message(). + */ + struct spi_offload *offload; + /* List of spi_res resources when the SPI message is processed */ struct list_head resources; }; -- cgit v1.2.3 From d7231be4b4657e5f922a4c6dc11e8dffc71fee87 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 7 Feb 2025 14:08:59 -0600 Subject: spi: offload: add support for hardware triggers Extend SPI offloading to support hardware triggers. This allows an arbitrary hardware trigger to be used to start a SPI transfer that was previously set up with spi_optimize_message(). 
A new struct spi_offload_trigger is introduced that can be used to configure any type of trigger. It has a type discriminator and a union to allow it to be extended in the future. Two trigger types are defined to start with. One is a trigger that indicates that the SPI peripheral is ready to read or write data. The other is a periodic trigger to repeat a SPI message at a fixed rate. There is also a spi_offload_hw_trigger_validate() function that works similar to clk_round_rate(). It basically asks the question of if we enabled the hardware trigger what would the actual parameters be. This can be used to test if the requested trigger type is actually supported by the hardware and for periodic triggers, it can be used to find the actual rate that the hardware is capable of. Reviewed-by: Jonathan Cameron Reviewed-by: Nuno Sa Signed-off-by: David Lechner Link: https://patch.msgid.link/20250207-dlech-mainline-spi-engine-offload-2-v8-2-e48a489be48c@baylibre.com Signed-off-by: Mark Brown --- include/linux/spi/offload/consumer.h | 12 ++++++++++++ include/linux/spi/offload/provider.h | 28 +++++++++++++++++++++++++++ include/linux/spi/offload/types.h | 37 ++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/offload/consumer.h b/include/linux/spi/offload/consumer.h index 05543dbedf30..5a0ec5303d60 100644 --- a/include/linux/spi/offload/consumer.h +++ b/include/linux/spi/offload/consumer.h @@ -19,4 +19,16 @@ struct spi_device; struct spi_offload *devm_spi_offload_get(struct device *dev, struct spi_device *spi, const struct spi_offload_config *config); +struct spi_offload_trigger +*devm_spi_offload_trigger_get(struct device *dev, + struct spi_offload *offload, + enum spi_offload_trigger_type type); +int spi_offload_trigger_validate(struct spi_offload_trigger *trigger, + struct spi_offload_trigger_config *config); +int spi_offload_trigger_enable(struct spi_offload *offload, + struct spi_offload_trigger *trigger, + struct spi_offload_trigger_config *config); +void spi_offload_trigger_disable(struct spi_offload *offload, + struct spi_offload_trigger *trigger); + #endif /* __LINUX_SPI_OFFLOAD_CONSUMER_H */ diff --git a/include/linux/spi/offload/provider.h b/include/linux/spi/offload/provider.h index 278c4edfcdb7..76c7cf651092 100644 --- a/include/linux/spi/offload/provider.h +++ b/include/linux/spi/offload/provider.h @@ -8,12 +8,40 @@ #define __LINUX_SPI_OFFLOAD_PROVIDER_H #include +#include #include MODULE_IMPORT_NS("SPI_OFFLOAD"); struct device; +struct spi_offload_trigger; struct spi_offload *devm_spi_offload_alloc(struct device *dev, size_t priv_size); +struct spi_offload_trigger_ops { + bool (*match)(struct spi_offload_trigger *trigger, + enum spi_offload_trigger_type type, u64 *args, u32 nargs); + int (*request)(struct spi_offload_trigger *trigger, + enum spi_offload_trigger_type type, u64 *args, u32 nargs); + void (*release)(struct spi_offload_trigger *trigger); + int (*validate)(struct spi_offload_trigger *trigger, + struct spi_offload_trigger_config *config); + int (*enable)(struct spi_offload_trigger *trigger, + struct spi_offload_trigger_config *config); + void (*disable)(struct spi_offload_trigger *trigger); +}; + +struct spi_offload_trigger_info { + /** @fwnode: Provider fwnode, used to match to consumer. */ + struct fwnode_handle *fwnode; + /** @ops: Provider-specific callbacks. */ + const struct spi_offload_trigger_ops *ops; + /** Provider-specific state to be used in callbacks. 
*/ + void *priv; +}; + +int devm_spi_offload_trigger_register(struct device *dev, + struct spi_offload_trigger_info *info); +void *spi_offload_trigger_get_priv(struct spi_offload_trigger *trigger); + #endif /* __LINUX_SPI_OFFLOAD_PROVIDER_H */ diff --git a/include/linux/spi/offload/types.h b/include/linux/spi/offload/types.h index a74f8d84541b..7476f2073b02 100644 --- a/include/linux/spi/offload/types.h +++ b/include/linux/spi/offload/types.h @@ -38,6 +38,43 @@ struct spi_offload { struct device *provider_dev; /** @priv: provider driver private data */ void *priv; + /** @ops: callbacks for offload support */ + const struct spi_offload_ops *ops; +}; + +enum spi_offload_trigger_type { + /* Indication from SPI peripheral that data is ready to read. */ + SPI_OFFLOAD_TRIGGER_DATA_READY, + /* Trigger comes from a periodic source such as a clock. */ + SPI_OFFLOAD_TRIGGER_PERIODIC, +}; + +struct spi_offload_trigger_periodic { + u64 frequency_hz; +}; + +struct spi_offload_trigger_config { + /** @type: type discriminator for union */ + enum spi_offload_trigger_type type; + union { + struct spi_offload_trigger_periodic periodic; + }; +}; + +/** + * struct spi_offload_ops - callbacks implemented by offload providers + */ +struct spi_offload_ops { + /** + * @trigger_enable: Optional callback to enable the trigger for the + * given offload instance. + */ + int (*trigger_enable)(struct spi_offload *offload); + /** + * @trigger_disable: Optional callback to disable the trigger for the + * given offload instance. + */ + void (*trigger_disable)(struct spi_offload *offload); }; #endif /* __LINUX_SPI_OFFLOAD_TYPES_H */ -- cgit v1.2.3 From 700a281905f2a4ccf6f3b2d3cd6985e034b4b021 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 7 Feb 2025 14:09:02 -0600 Subject: spi: add offload TX/RX streaming APIs Most configuration of SPI offloads is handled opaquely using the offload pointer that is passed to the various offload functions. However, there are some offload features that need to be controlled on a per-transfer basis. This patch adds a flag field to struct spi_transfer to allow specifying such features. The first feature to be added is the ability to stream data to/from a hardware sink/source rather than using a tx or rx buffer. Additional flags can be added in the future as needed. A flags field is also added to the offload struct for providers to indicate which flags are supported. This allows for generic checking of offload capabilities during __spi_validate() so that each offload provider doesn't have to implement its own validation. As a first user of this streaming capability, getter functions are added to get a DMA channel that is directly connected to the offload. Peripheral drivers will use this to get a DMA channel and configure it to suit their needs. 
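Putting the pieces of this series together, a hedged sketch of a peripheral driver wiring up an RX stream with the APIs added here (error handling abridged; the example_* names are hypothetical):

	static const struct spi_offload_config example_config = {
		.capability_flags = SPI_OFFLOAD_CAP_TRIGGER |
				    SPI_OFFLOAD_CAP_RX_STREAM_DMA,
	};

	static int example_setup(struct device *dev, struct spi_device *spi,
				 struct spi_message *msg, struct spi_transfer *xfer)
	{
		struct spi_offload *offload;
		struct dma_chan *rx_dma;

		offload = devm_spi_offload_get(dev, spi, &example_config);
		if (IS_ERR(offload))
			return PTR_ERR(offload);

		rx_dma = devm_spi_offload_rx_stream_request_dma_chan(dev, offload);
		if (IS_ERR(rx_dma))
			return PTR_ERR(rx_dma);

		/* RX data goes to the stream, not to an rx_buf. */
		xfer->offload_flags = SPI_OFFLOAD_XFER_RX_STREAM;

		/* Attach the offload before optimizing the message. */
		msg->offload = offload;
		return spi_optimize_message(spi, msg);
	}
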
Reviewed-by: Jonathan Cameron Reviewed-by: Nuno Sa Signed-off-by: David Lechner Link: https://patch.msgid.link/20250207-dlech-mainline-spi-engine-offload-2-v8-5-e48a489be48c@baylibre.com Signed-off-by: Mark Brown --- include/linux/spi/offload/consumer.h | 5 +++++ include/linux/spi/offload/types.h | 19 +++++++++++++++++++ include/linux/spi/spi.h | 3 +++ 3 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/offload/consumer.h b/include/linux/spi/offload/consumer.h index 5a0ec5303d60..cd7d5daa21e6 100644 --- a/include/linux/spi/offload/consumer.h +++ b/include/linux/spi/offload/consumer.h @@ -31,4 +31,9 @@ int spi_offload_trigger_enable(struct spi_offload *offload, void spi_offload_trigger_disable(struct spi_offload *offload, struct spi_offload_trigger *trigger); +struct dma_chan *devm_spi_offload_tx_stream_request_dma_chan(struct device *dev, + struct spi_offload *offload); +struct dma_chan *devm_spi_offload_rx_stream_request_dma_chan(struct device *dev, + struct spi_offload *offload); + #endif /* __LINUX_SPI_OFFLOAD_CONSUMER_H */ diff --git a/include/linux/spi/offload/types.h b/include/linux/spi/offload/types.h index 7476f2073b02..86d0e8cb9495 100644 --- a/include/linux/spi/offload/types.h +++ b/include/linux/spi/offload/types.h @@ -11,6 +11,11 @@ struct device; +/* This is write xfer but TX uses external data stream rather than tx_buf. */ +#define SPI_OFFLOAD_XFER_TX_STREAM BIT(0) +/* This is read xfer but RX uses external data stream rather than rx_buf. */ +#define SPI_OFFLOAD_XFER_RX_STREAM BIT(1) + /* Offload can be triggered by external hardware event. */ #define SPI_OFFLOAD_CAP_TRIGGER BIT(0) /* Offload can record and then play back TX data when triggered. */ @@ -40,6 +45,8 @@ struct spi_offload { void *priv; /** @ops: callbacks for offload support */ const struct spi_offload_ops *ops; + /** @xfer_flags: %SPI_OFFLOAD_XFER_* flags supported by provider */ + u32 xfer_flags; }; enum spi_offload_trigger_type { @@ -75,6 +82,18 @@ struct spi_offload_ops { * given offload instance. */ void (*trigger_disable)(struct spi_offload *offload); + /** + * @tx_stream_request_dma_chan: Optional callback for controllers that + * have an offload where the TX data stream is connected directly to a + * DMA channel. + */ + struct dma_chan *(*tx_stream_request_dma_chan)(struct spi_offload *offload); + /** + * @rx_stream_request_dma_chan: Optional callback for controllers that + * have an offload where the RX data stream is connected directly to a + * DMA channel. + */ + struct dma_chan *(*rx_stream_request_dma_chan)(struct spi_offload *offload); }; #endif /* __LINUX_SPI_OFFLOAD_TYPES_H */ diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 98bdc8c16c20..4c087009cf97 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -1093,6 +1093,9 @@ struct spi_transfer { u32 effective_speed_hz; + /* Use %SPI_OFFLOAD_XFER_* from spi-offload.h */ + unsigned int offload_flags; + unsigned int ptp_sts_word_pre; unsigned int ptp_sts_word_post; -- cgit v1.2.3 From c6ad9fdbd44b78f51fa50138247694774ab99e97 Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz Date: Mon, 27 Jan 2025 10:57:18 -0500 Subject: io_uring,lsm,selinux: add LSM hooks for io_uring_setup() It is desirable to allow an LSM to configure accessibility to io_uring because it is a coarse yet very simple way to restrict access to it. So, add an LSM hook, io_uring_allowed(), to guard access to io_uring. 
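For illustration, an LSM wires the new hook up like any other; a minimal hypothetical sketch (the policy check is invented):

	static int example_uring_allowed(void)
	{
		/* hypothetical policy decision */
		if (!example_policy_allows_io_uring())
			return -EPERM;
		return 0;
	}

	static struct security_hook_list example_hooks[] __ro_after_init = {
		LSM_HOOK_INIT(uring_allowed, example_uring_allowed),
	};
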
Cc: Paul Moore Signed-off-by: Hamza Mahfooz Acked-by: Jens Axboe [PM: merge fuzz due to changes in preceding patches, subj tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 1 + include/linux/security.h | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index e2f1ce37c41e..9eb313bd0c93 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -455,6 +455,7 @@ LSM_HOOK(int, 0, perf_event_write, struct perf_event *event) LSM_HOOK(int, 0, uring_override_creds, const struct cred *new) LSM_HOOK(int, 0, uring_sqpoll, void) LSM_HOOK(int, 0, uring_cmd, struct io_uring_cmd *ioucmd) +LSM_HOOK(int, 0, uring_allowed, void) #endif /* CONFIG_IO_URING */ LSM_HOOK(void, LSM_RET_VOID, initramfs_populated, void) diff --git a/include/linux/security.h b/include/linux/security.h index 980b6c207cad..3e68f8468a22 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2362,6 +2362,7 @@ static inline int security_perf_event_write(struct perf_event *event) extern int security_uring_override_creds(const struct cred *new); extern int security_uring_sqpoll(void); extern int security_uring_cmd(struct io_uring_cmd *ioucmd); +extern int security_uring_allowed(void); #else static inline int security_uring_override_creds(const struct cred *new) { @@ -2375,6 +2376,10 @@ static inline int security_uring_cmd(struct io_uring_cmd *ioucmd) { return 0; } +static inline int security_uring_allowed(void) +{ + return 0; +} #endif /* CONFIG_SECURITY */ #endif /* CONFIG_IO_URING */ -- cgit v1.2.3 From ea145d530a2db80ff41622af16757f909a435dc9 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Wed, 5 Feb 2025 16:31:48 -0800 Subject: bpf: define KF_ARENA_* flags for bpf_arena kfuncs bpf_arena_alloc_pages() and bpf_arena_free_pages() work with the bpf_arena pointers [1], which is indicated by the __arena macro in the kernel source code: #define __arena __attribute__((address_space(1))) However, currently this information is absent from the debug data in the vmlinux binary. As a consequence, bpf_arena_* kfuncs declarations in vmlinux.h (produced by bpftool) do not match prototypes expected by the BPF programs attempting to use these functions. Introduce a set of kfunc flags to mark relevant types as bpf_arena pointers. The flags then can be detected by pahole when generating BTF from vmlinux's DWARF, allowing it to emit corresponding BTF type tags for the marked kfuncs. With recently proposed BTF extension [2], these type tags will be processed by bpftool when dumping vmlinux.h, and corresponding compiler attributes will be added to the declarations. 
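A hedged sketch of how the new flags might be applied when the arena kfuncs are registered (the exact flag combination is illustrative; additional flags such as KF_TRUSTED_ARGS are omitted):

	BTF_KFUNCS_START(arena_kfuncs)
	/* returns an arena pointer, takes one as its second argument */
	BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_ARENA_RET | KF_ARENA_ARG2)
	/* takes an arena pointer as its second argument */
	BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_ARENA_ARG2)
	BTF_KFUNCS_END(arena_kfuncs)
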
[1] https://lwn.net/Articles/961594/ [2] https://lore.kernel.org/bpf/20250130201239.1429648-1-ihor.solodrai@linux.dev/ Suggested-by: Andrii Nakryiko Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20250206003148.2308659-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 2a08a2b55592..ebc0c0c9b944 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -76,6 +76,9 @@ #define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */ #define KF_RCU_PROTECTED (1 << 11) /* kfunc should be protected by rcu cs when they are invoked */ #define KF_FASTCALL (1 << 12) /* kfunc supports bpf_fastcall protocol */ +#define KF_ARENA_RET (1 << 13) /* kfunc returns an arena pointer */ +#define KF_ARENA_ARG1 (1 << 14) /* kfunc takes an arena pointer as its first argument */ +#define KF_ARENA_ARG2 (1 << 15) /* kfunc takes an arena pointer as its second argument */ /* * Tag marking a kernel function as a kfunc. This is meant to minimize the -- cgit v1.2.3 From dbd2e08ff09fd6bd51215b44474899cc1b7b7a16 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 27 Jan 2025 20:30:22 +0100 Subject: iio: gts-helper: export iio_gts_get_total_gain() Export this function in preparation for the fix in veml6030.c, where the total gain can be used to ease the calculation of the processed value of the IIO_LIGHT channel compared to acquiring the scale in NANO. Suggested-by: Matti Vaittinen Signed-off-by: Javier Carrasco Reviewed-by: Matti Vaittinen Link: https://patch.msgid.link/20250127-veml6030-scale-v3-1-4f32ba03df94@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio-gts-helper.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio-gts-helper.h b/include/linux/iio/iio-gts-helper.h index e5de7a124bad..66f830ab9b49 100644 --- a/include/linux/iio/iio-gts-helper.h +++ b/include/linux/iio/iio-gts-helper.h @@ -208,5 +208,6 @@ int iio_gts_all_avail_scales(struct iio_gts *gts, const int **vals, int *type, int *length); int iio_gts_avail_scales_for_time(struct iio_gts *gts, int time, const int **vals, int *type, int *length); +int iio_gts_get_total_gain(struct iio_gts *gts, int gain, int time); #endif -- cgit v1.2.3 From feb541bfacbe23bf19a96b96db03e6c7505e1b03 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 29 Jan 2025 19:51:20 -0800 Subject: lib/crc64-rocksoft: stop wrapping the crypto API Following what was done for the CRC32 and CRC-T10DIF library functions, get rid of the pointless use of the crypto API and make crc64_rocksoft_update() call into the library directly. This is faster and simpler. Remove crc64_rocksoft() (the version of the function that did not take a 'crc' argument) since it is unused. Reviewed-by: Ard Biesheuvel Reviewed-by: "Martin K. 
Petersen" Acked-by: Keith Busch Link: https://lore.kernel.org/r/20250130035130.180676-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc64.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc64.h b/include/linux/crc64.h index e044c60d1e61..0a595b272166 100644 --- a/include/linux/crc64.h +++ b/include/linux/crc64.h @@ -12,7 +12,16 @@ u64 __pure crc64_be(u64 crc, const void *p, size_t len); u64 __pure crc64_rocksoft_generic(u64 crc, const void *p, size_t len); -u64 crc64_rocksoft(const unsigned char *buffer, size_t len); -u64 crc64_rocksoft_update(u64 crc, const unsigned char *buffer, size_t len); +/** + * crc64_rocksoft_update - Calculate bitwise Rocksoft CRC64 + * @crc: seed value for computation. 0 for a new CRC calculation, or the + * previous crc64 value if computing incrementally. + * @p: pointer to buffer over which CRC64 is run + * @len: length of buffer @p + */ +static inline u64 crc64_rocksoft_update(u64 crc, const u8 *p, size_t len) +{ + return crc64_rocksoft_generic(crc, p, len); +} #endif /* _LINUX_CRC64_H */ -- cgit v1.2.3 From 0fcec0b73adc5f86597d342062114b77bcf7ec9d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 29 Jan 2025 19:51:21 -0800 Subject: crypto: crc64-rocksoft - remove from crypto API Remove crc64-rocksoft from the crypto API. It has no known user now that the lib is no longer built on top of it. It was also added much more recently than the longstanding crc32 and crc32c. Unlike crc32 and crc32c, crc64-rocksoft is also not mentioned in the dm-integrity documentation and there are no references to it in anywhere in the cryptsetup git repo, so it is unlikely to have any user there either. Also, this CRC variant is named incorrectly; it has nothing to do with Rocksoft and should be called crc64-nvme. That is yet another reason to remove it from the crypto API; we would not want anyone to start depending on the current incorrect algorithm name of crc64-rocksoft. Note that this change temporarily makes this CRC variant not be covered by any tests, as previously it was relying on the crypto self-tests. This will be fixed by adding this CRC variant to crc_kunit. Reviewed-by: Ard Biesheuvel Reviewed-by: "Martin K. Petersen" Acked-by: Keith Busch Link: https://lore.kernel.org/r/20250130035130.180676-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc64.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc64.h b/include/linux/crc64.h index 0a595b272166..7880aeab69d6 100644 --- a/include/linux/crc64.h +++ b/include/linux/crc64.h @@ -7,8 +7,6 @@ #include -#define CRC64_ROCKSOFT_STRING "crc64-rocksoft" - u64 __pure crc64_be(u64 crc, const void *p, size_t len); u64 __pure crc64_rocksoft_generic(u64 crc, const void *p, size_t len); -- cgit v1.2.3 From f6c3f6fb32301dfb35fed3ef8a39de3e13c67ad2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 29 Jan 2025 19:51:22 -0800 Subject: lib/crc64: rename CRC64-Rocksoft to CRC64-NVME This CRC64 variant comes from the NVME NVM Command Set Specification (https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0e-2024.07.29-Ratified.pdf). The "Rocksoft Model CRC Algorithm", published in 1993 and available at https://www.zlib.net/crc_v3.txt, is a generalized CRC algorithm that can calculate any variant of CRC, given a list of parameters such as polynomial, bit order, etc. It is not a CRC variant. 
The NVME NVM Command Set Specification has a table that gives the "Rocksoft Model Parameters" for the CRC variant it uses. When support for this CRC variant was added to Linux, this table seems to have been misinterpreted as naming the CRC variant the "Rocksoft" CRC. In fact, the table names the CRC variant as the "NVM Express 64b CRC". Most implementations of this CRC variant outside Linux have been calling it CRC64-NVME. Therefore, update Linux to match. While at it, remove the superfluous "update" from the function name, so crc64_rocksoft_update() is now just crc64_nvme(), matching most of the other CRC library functions. Reviewed-by: Ard Biesheuvel Reviewed-by: "Martin K. Petersen" Acked-by: Keith Busch Link: https://lore.kernel.org/r/20250130035130.180676-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc64.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc64.h b/include/linux/crc64.h index 7880aeab69d6..17cf5af3e78e 100644 --- a/include/linux/crc64.h +++ b/include/linux/crc64.h @@ -8,18 +8,21 @@ #include u64 __pure crc64_be(u64 crc, const void *p, size_t len); -u64 __pure crc64_rocksoft_generic(u64 crc, const void *p, size_t len); +u64 __pure crc64_nvme_generic(u64 crc, const void *p, size_t len); /** - * crc64_rocksoft_update - Calculate bitwise Rocksoft CRC64 + * crc64_nvme - Calculate CRC64-NVME * @crc: seed value for computation. 0 for a new CRC calculation, or the * previous crc64 value if computing incrementally. * @p: pointer to buffer over which CRC64 is run * @len: length of buffer @p + * + * This computes the CRC64 defined in the NVME NVM Command Set Specification, + * *including the bitwise inversion at the beginning and end*. */ -static inline u64 crc64_rocksoft_update(u64 crc, const u8 *p, size_t len) +static inline u64 crc64_nvme(u64 crc, const u8 *p, size_t len) { - return crc64_rocksoft_generic(crc, p, len); + return crc64_nvme_generic(crc, p, len); } #endif /* _LINUX_CRC64_H */ -- cgit v1.2.3 From 067bc8717aeee415d6a6294e63b70821846c45c3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 29 Jan 2025 19:51:24 -0800 Subject: lib/crc64: add support for arch-optimized implementations Add support for architecture-optimized implementations of the CRC64 library functions, following the approach taken for the CRC32 and CRC-T10DIF library functions. Also take the opportunity to tweak the function prototypes: - Use 'const void *' for the lib entry points (since this is easier for users) but 'const u8 *' for the underlying arch and generic functions (since this is easier for the implementations of these functions). - Don't bother with __pure. It's an unusual optimization that doesn't help properly written code. It's a weird quirk we can do without. Reviewed-by: Ard Biesheuvel Reviewed-by: "Martin K. 
Petersen" Acked-by: Keith Busch Link: https://lore.kernel.org/r/20250130035130.180676-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc64.h | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc64.h b/include/linux/crc64.h index 17cf5af3e78e..41de30b907df 100644 --- a/include/linux/crc64.h +++ b/include/linux/crc64.h @@ -7,8 +7,24 @@ #include -u64 __pure crc64_be(u64 crc, const void *p, size_t len); -u64 __pure crc64_nvme_generic(u64 crc, const void *p, size_t len); +u64 crc64_be_arch(u64 crc, const u8 *p, size_t len); +u64 crc64_be_generic(u64 crc, const u8 *p, size_t len); +u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len); +u64 crc64_nvme_generic(u64 crc, const u8 *p, size_t len); + +/** + * crc64_be - Calculate bitwise big-endian ECMA-182 CRC64 + * @crc: seed value for computation. 0 or (u64)~0 for a new CRC calculation, + * or the previous crc64 value if computing incrementally. + * @p: pointer to buffer over which CRC64 is run + * @len: length of buffer @p + */ +static inline u64 crc64_be(u64 crc, const void *p, size_t len) +{ + if (IS_ENABLED(CONFIG_CRC64_ARCH)) + return crc64_be_arch(crc, p, len); + return crc64_be_generic(crc, p, len); +} /** * crc64_nvme - Calculate CRC64-NVME @@ -20,9 +36,11 @@ u64 __pure crc64_nvme_generic(u64 crc, const void *p, size_t len); * This computes the CRC64 defined in the NVME NVM Command Set Specification, * *including the bitwise inversion at the beginning and end*. */ -static inline u64 crc64_nvme(u64 crc, const u8 *p, size_t len) +static inline u64 crc64_nvme(u64 crc, const void *p, size_t len) { - return crc64_nvme_generic(crc, p, len); + if (IS_ENABLED(CONFIG_CRC64_ARCH)) + return ~crc64_nvme_arch(~crc, p, len); + return ~crc64_nvme_generic(~crc, p, len); } #endif /* _LINUX_CRC64_H */ -- cgit v1.2.3 From 2d7da4f6b0c0445fa0e1b61fd95fa8111eae7fdd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 7 Feb 2025 18:49:07 -0800 Subject: lib/crc32: use void pointer for data Update crc32_le(), crc32_be(), and __crc32c_le() to take the data as a 'const void *' instead of 'const u8 *'. This makes them slightly easier to use, as it can eliminate the need for casts in the calling code. It's the only pointer argument, so there is no possibility for confusion with another pointer argument. Also, some of the CRC library functions, for example crc32c() and crc64_be(), already used 'const void *'. Let's standardize on that, as it seems like a better choice. The underlying base and arch functions continue to use 'const u8 *', as that is often more convenient for the implementation. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250208024911.14936-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc32.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index e9bd40056687..e70977014cfd 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -15,14 +15,14 @@ u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len); u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len); u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len); -static inline u32 __pure crc32_le(u32 crc, const u8 *p, size_t len) +static inline u32 __pure crc32_le(u32 crc, const void *p, size_t len) { if (IS_ENABLED(CONFIG_CRC32_ARCH)) return crc32_le_arch(crc, p, len); return crc32_le_base(crc, p, len); } -static inline u32 __pure crc32_be(u32 crc, const u8 *p, size_t len) +static inline u32 __pure crc32_be(u32 crc, const void *p, size_t len) { if (IS_ENABLED(CONFIG_CRC32_ARCH)) return crc32_be_arch(crc, p, len); @@ -30,7 +30,7 @@ static inline u32 __pure crc32_be(u32 crc, const u8 *p, size_t len) } /* TODO: leading underscores should be dropped once callers have been updated */ -static inline u32 __pure __crc32c_le(u32 crc, const u8 *p, size_t len) +static inline u32 __pure __crc32c_le(u32 crc, const void *p, size_t len) { if (IS_ENABLED(CONFIG_CRC32_ARCH)) return crc32c_le_arch(crc, p, len); -- cgit v1.2.3 From bc2736fe7e0b03866b4cb2da320b1aa705b193c0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 7 Feb 2025 18:49:08 -0800 Subject: lib/crc32: don't bother with pure and const function attributes Drop the use of __pure and __attribute_const__ from the CRC32 library functions that had them. Both of these are unusual optimizations that don't help properly written code. They seem more likely to cause problems than have any real benefit. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250208024911.14936-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc32.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index e70977014cfd..61a7ec29d633 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -8,21 +8,21 @@ #include #include -u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len); -u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len); -u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len); -u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len); -u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len); -u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len); +u32 crc32_le_arch(u32 crc, const u8 *p, size_t len); +u32 crc32_le_base(u32 crc, const u8 *p, size_t len); +u32 crc32_be_arch(u32 crc, const u8 *p, size_t len); +u32 crc32_be_base(u32 crc, const u8 *p, size_t len); +u32 crc32c_le_arch(u32 crc, const u8 *p, size_t len); +u32 crc32c_le_base(u32 crc, const u8 *p, size_t len); -static inline u32 __pure crc32_le(u32 crc, const void *p, size_t len) +static inline u32 crc32_le(u32 crc, const void *p, size_t len) { if (IS_ENABLED(CONFIG_CRC32_ARCH)) return crc32_le_arch(crc, p, len); return crc32_le_base(crc, p, len); } -static inline u32 __pure crc32_be(u32 crc, const void *p, size_t len) +static inline u32 crc32_be(u32 crc, const void *p, size_t len) { if (IS_ENABLED(CONFIG_CRC32_ARCH)) return crc32_be_arch(crc, p, len); @@ -30,7 +30,7 @@ static inline u32 __pure crc32_be(u32 crc, const void *p, size_t len) } /* TODO: leading underscores should be dropped once callers have been updated */ -static inline u32 __pure __crc32c_le(u32 crc, const void *p, size_t len) +static inline u32 __crc32c_le(u32 crc, const void *p, size_t len) { if (IS_ENABLED(CONFIG_CRC32_ARCH)) return crc32c_le_arch(crc, p, len); @@ -70,7 +70,7 @@ static inline u32 crc32_optimizations(void) { return 0; } * with the same initializer as crc1, and crc2 seed was 0. See * also crc32_combine_test(). */ -u32 __attribute_const__ crc32_le_shift(u32 crc, size_t len); +u32 crc32_le_shift(u32 crc, size_t len); static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2) { @@ -95,7 +95,7 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2) * seeded with the same initializer as crc1, and crc2 seed * was 0. See also crc32c_combine_test(). */ -u32 __attribute_const__ __crc32c_le_shift(u32 crc, size_t len); +u32 __crc32c_le_shift(u32 crc, size_t len); static inline u32 __crc32c_le_combine(u32 crc1, u32 crc2, size_t len2) { -- cgit v1.2.3 From 8df36829045a133d558421cc3cf2384a6d9e47cc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 7 Feb 2025 18:49:09 -0800 Subject: lib/crc32: standardize on crc32c() name for Castagnoli CRC32 For historical reasons, the Castagnoli CRC32 is available under 3 names: crc32c(), crc32c_le(), and __crc32c_le(). Most callers use crc32c(). The more verbose versions are not really warranted; there is no "_be" version that the "_le" version needs to be differentiated from, and the leading underscores are pointless. Therefore, let's standardize on just crc32c(). Remove the other two names, and update callers accordingly. 
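[ ed: the corresponding caller updates are mechanical renames; a hypothetical example in diff form: ]

	-	csum = __crc32c_le(csum, data, len);
	+	csum = crc32c(csum, data, len);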
Specifically, the new crc32c() comes from what was previously __crc32c_le(), so compared to the old crc32c() it now takes a size_t length rather than unsigned int, and it's now in linux/crc32.h instead of just linux/crc32c.h (which includes linux/crc32.h). Later patches will also rename __crc32c_le_combine(), crc32c_le_base(), and crc32c_le_arch(). Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250208024911.14936-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc32.h | 5 ++--- include/linux/crc32c.h | 8 -------- 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 61a7ec29d633..bc39b023eac0 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -29,8 +29,7 @@ static inline u32 crc32_be(u32 crc, const void *p, size_t len) return crc32_be_base(crc, p, len); } -/* TODO: leading underscores should be dropped once callers have been updated */ -static inline u32 __crc32c_le(u32 crc, const void *p, size_t len) +static inline u32 crc32c(u32 crc, const void *p, size_t len) { if (IS_ENABLED(CONFIG_CRC32_ARCH)) return crc32c_le_arch(crc, p, len); @@ -45,7 +44,7 @@ static inline u32 __crc32c_le(u32 crc, const void *p, size_t len) */ #define CRC32_LE_OPTIMIZATION BIT(0) /* crc32_le() is optimized */ #define CRC32_BE_OPTIMIZATION BIT(1) /* crc32_be() is optimized */ -#define CRC32C_OPTIMIZATION BIT(2) /* __crc32c_le() is optimized */ +#define CRC32C_OPTIMIZATION BIT(2) /* crc32c() is optimized */ #if IS_ENABLED(CONFIG_CRC32_ARCH) u32 crc32_optimizations(void); #else diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h index 47eb78003c26..b8cff2f4309a 100644 --- a/include/linux/crc32c.h +++ b/include/linux/crc32c.h @@ -4,12 +4,4 @@ #include -static inline u32 crc32c(u32 crc, const void *address, unsigned int length) -{ - return __crc32c_le(crc, address, length); -} - -/* This macro exists for backwards-compatibility. */ -#define crc32c_le crc32c - #endif /* _LINUX_CRC32C_H */ -- cgit v1.2.3 From c64e6570b48ab18675d00344fc3c1f13a86989b5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 7 Feb 2025 18:49:10 -0800 Subject: lib/crc32: rename __crc32c_le_combine() to crc32c_combine() Since the Castagnoli CRC32 is now always just crc32c(), rename __crc32c_le_combine() and __crc32c_le_shift() accordingly. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250208024911.14936-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc32.h | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index bc39b023eac0..535071964f52 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -76,29 +76,27 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2) return crc32_le_shift(crc1, len2) ^ crc2; } +u32 crc32c_shift(u32 crc, size_t len); + /** - * __crc32c_le_combine - Combine two crc32c check values into one. For two - * sequences of bytes, seq1 and seq2 with lengths len1 - * and len2, __crc32c_le() check values were calculated - * for each, crc1 and crc2. + * crc32c_combine - Combine two crc32c check values into one. For two sequences + * of bytes, seq1 and seq2 with lengths len1 and len2, crc32c() + * check values were calculated for each, crc1 and crc2. 
* * @crc1: crc32c of the first block * @crc2: crc32c of the second block * @len2: length of the second block * - * Return: The __crc32c_le() check value of seq1 and seq2 concatenated, - * requiring only crc1, crc2, and len2. Note: If seq_full denotes - * the concatenated memory area of seq1 with seq2, and crc_full - * the __crc32c_le() value of seq_full, then crc_full == - * __crc32c_le_combine(crc1, crc2, len2) when crc_full was - * seeded with the same initializer as crc1, and crc2 seed - * was 0. See also crc32c_combine_test(). + * Return: The crc32c() check value of seq1 and seq2 concatenated, requiring + * only crc1, crc2, and len2. Note: If seq_full denotes the concatenated + * memory area of seq1 with seq2, and crc_full the crc32c() value of + * seq_full, then crc_full == crc32c_combine(crc1, crc2, len2) when + * crc_full was seeded with the same initializer as crc1, and crc2 seed + * was 0. See also crc_combine_test(). */ -u32 __crc32c_le_shift(u32 crc, size_t len); - -static inline u32 __crc32c_le_combine(u32 crc1, u32 crc2, size_t len2) +static inline u32 crc32c_combine(u32 crc1, u32 crc2, size_t len2) { - return __crc32c_le_shift(crc1, len2) ^ crc2; + return crc32c_shift(crc1, len2) ^ crc2; } #define crc32(seed, data, length) crc32_le(seed, (unsigned char const *)(data), length) -- cgit v1.2.3 From 68ea3c2ae0affe68aefab27d55c82be5a45ad882 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 7 Feb 2025 18:49:11 -0800 Subject: lib/crc32: remove "_le" from crc32c base and arch functions Following the standardization on crc32c() as the lib entry point for the Castagnoli CRC32 instead of the previous mix of crc32c(), crc32c_le(), and __crc32c_le(), make the same change to the underlying base and arch functions that implement it. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250208024911.14936-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc32.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 535071964f52..69c2e8bb3782 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -12,8 +12,8 @@ u32 crc32_le_arch(u32 crc, const u8 *p, size_t len); u32 crc32_le_base(u32 crc, const u8 *p, size_t len); u32 crc32_be_arch(u32 crc, const u8 *p, size_t len); u32 crc32_be_base(u32 crc, const u8 *p, size_t len); -u32 crc32c_le_arch(u32 crc, const u8 *p, size_t len); -u32 crc32c_le_base(u32 crc, const u8 *p, size_t len); +u32 crc32c_arch(u32 crc, const u8 *p, size_t len); +u32 crc32c_base(u32 crc, const u8 *p, size_t len); static inline u32 crc32_le(u32 crc, const void *p, size_t len) { @@ -32,8 +32,8 @@ static inline u32 crc32_be(u32 crc, const void *p, size_t len) static inline u32 crc32c(u32 crc, const void *p, size_t len) { if (IS_ENABLED(CONFIG_CRC32_ARCH)) - return crc32c_le_arch(crc, p, len); - return crc32c_le_base(crc, p, len); + return crc32c_arch(crc, p, len); + return crc32c_base(crc, p, len); } /* -- cgit v1.2.3 From 8522104f75bf1ce33d76ea425185da2a7fba5a70 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 6 Feb 2025 09:38:57 -0800 Subject: crypto: crct10dif - remove from crypto API Remove the "crct10dif" shash algorithm from the crypto API. It has no known user now that the lib is no longer built on top of it. It has no remaining references in kernel code. The only other potential users would be the usual components that allow specifying arbitrary hash algorithms by name, namely AF_ALG and dm-integrity. 
However there are no indications that "crct10dif" is being used with these components. Debian Code Search and web searches don't find anything relevant, and explicitly grepping the source code of the usual suspects (cryptsetup, libell, iwd) finds no matches either. "crc32" and "crc32c" are used in a few more places, but that doesn't seem to be the case for "crct10dif". crc_t10dif_update() is also tested by crc_kunit now, so the test coverage provided via the crypto self-tests is no longer needed. Also note that the "crct10dif" shash algorithm was inconsistent with the rest of the shash API in that it wrote the digest in CPU endianness, making the resulting byte array differ on little endian vs. big endian platforms. This means it was effectively just built for use by the lib functions, and it was not actually correct to treat it as "just another hash function" that could be dropped in via the shash API. Reviewed-by: Ard Biesheuvel Reviewed-by: "Martin K. Petersen" Link: https://lore.kernel.org/r/20250206173857.39794-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc-t10dif.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc-t10dif.h b/include/linux/crc-t10dif.h index 16787c1cee21..d0706544fc11 100644 --- a/include/linux/crc-t10dif.h +++ b/include/linux/crc-t10dif.h @@ -4,9 +4,6 @@ #include -#define CRC_T10DIF_DIGEST_SIZE 2 -#define CRC_T10DIF_BLOCK_SIZE 1 - u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len); u16 crc_t10dif_generic(u16 crc, const u8 *p, size_t len); -- cgit v1.2.3 From d6104733178293b40044525b06d6a26356934da3 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Tue, 14 Jan 2025 13:36:34 +0100 Subject: spinlock: extend guard with spinlock_bh variants Extend guard APIs with missing raw/spinlock_bh variants. Signed-off-by: Christian Marangi Acked-by: Peter Zijlstra (Intel) Signed-off-by: Herbert Xu --- include/linux/spinlock.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 63dd8cf3c3c2..d3561c4a080e 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -548,6 +548,12 @@ DEFINE_LOCK_GUARD_1(raw_spinlock_irq, raw_spinlock_t, DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irq, _try, raw_spin_trylock_irq(_T->lock)) +DEFINE_LOCK_GUARD_1(raw_spinlock_bh, raw_spinlock_t, + raw_spin_lock_bh(_T->lock), + raw_spin_unlock_bh(_T->lock)) + +DEFINE_LOCK_GUARD_1_COND(raw_spinlock_bh, _try, raw_spin_trylock_bh(_T->lock)) + DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t, raw_spin_lock_irqsave(_T->lock, _T->flags), raw_spin_unlock_irqrestore(_T->lock, _T->flags), @@ -569,6 +575,13 @@ DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t, DEFINE_LOCK_GUARD_1_COND(spinlock_irq, _try, spin_trylock_irq(_T->lock)) +DEFINE_LOCK_GUARD_1(spinlock_bh, spinlock_t, + spin_lock_bh(_T->lock), + spin_unlock_bh(_T->lock)) + +DEFINE_LOCK_GUARD_1_COND(spinlock_bh, _try, + spin_trylock_bh(_T->lock)) + DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t, spin_lock_irqsave(_T->lock, _T->flags), spin_unlock_irqrestore(_T->lock, _T->flags), -- cgit v1.2.3 From b16510a530d1e6ab9683f04f8fb34f2e0f538275 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sun, 2 Feb 2025 20:00:52 +0100 Subject: crypto: ecdsa - Harden against integer overflows in DIV_ROUND_UP() Herbert notes that DIV_ROUND_UP() may overflow unnecessarily if an ecdsa implementation's ->key_size() callback returns an unusually large value. 
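[ ed: a worked example of the overflow, assuming 32-bit unsigned arithmetic; the values are chosen purely for illustration, and the safe form shown is the one suggested next. ]

	unsigned int n = 0xfffffffd, d = 8;

	/* DIV_ROUND_UP expands to (n + d - 1) / d: n + 7 wraps to 4, yielding 0. */
	unsigned int bad  = (n + d - 1) / d;		/* 0          -- wrong */

	/* No addition on n, so nothing wraps; this yields the expected result. */
	unsigned int good = n / d + !!(n & (d - 1));	/* 0x20000000 -- correct */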
Herbert instead suggests (for a division by 8): X / 8 + !!(X & 7) Based on this formula, introduce a generic DIV_ROUND_UP_POW2() macro and use it in lieu of DIV_ROUND_UP() for ->key_size() return values. Additionally, use the macro in ecc_digits_from_bytes(), whose "nbytes" parameter is a ->key_size() return value in some instances, or a user-specified ASN.1 length in the case of ecdsa_get_signature_rs(). Link: https://lore.kernel.org/r/Z3iElsILmoSu6FuC@gondor.apana.org.au/ Signed-off-by: Lukas Wunner Signed-off-by: Lukas Wunner Signed-off-by: Herbert Xu --- include/linux/math.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/math.h b/include/linux/math.h index f5f18dc3616b..0198c92cbe3e 100644 --- a/include/linux/math.h +++ b/include/linux/math.h @@ -34,6 +34,18 @@ */ #define round_down(x, y) ((x) & ~__round_mask(x, y)) +/** + * DIV_ROUND_UP_POW2 - divide and round up + * @n: numerator + * @d: denominator (must be a power of 2) + * + * Divides @n by @d and rounds up to next multiple of @d (which must be a power + * of 2). Avoids integer overflows that may occur with __KERNEL_DIV_ROUND_UP(). + * Performance is roughly equivalent to __KERNEL_DIV_ROUND_UP(). + */ +#define DIV_ROUND_UP_POW2(n, d) \ + ((n) / (d) + !!((n) & ((d) - 1))) + #define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP #define DIV_ROUND_DOWN_ULL(ll, d) \ -- cgit v1.2.3 From 0645b245a2bd1abf1b36b570db47517e5519819c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 8 Feb 2025 09:56:47 -0800 Subject: lib/crc-t10dif: remove crc_t10dif_is_optimized() With the "crct10dif" algorithm having been removed from the crypto API, crc_t10dif_is_optimized() is no longer used. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250208175647.12333-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc-t10dif.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc-t10dif.h b/include/linux/crc-t10dif.h index d0706544fc11..a559fdff3f7e 100644 --- a/include/linux/crc-t10dif.h +++ b/include/linux/crc-t10dif.h @@ -19,13 +19,4 @@ static inline u16 crc_t10dif(const u8 *p, size_t len) return crc_t10dif_update(0, p, len); } -#if IS_ENABLED(CONFIG_CRC_T10DIF_ARCH) -bool crc_t10dif_is_optimized(void); -#else -static inline bool crc_t10dif_is_optimized(void) -{ - return false; -} -#endif - #endif -- cgit v1.2.3 From 9748cb2dc393e9095c39d6de0786c7b4e07b2530 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 6 Feb 2025 16:42:43 +1100 Subject: VFS: repack DENTRY_ flags. Bits 13, 23, 24, and 27 are not used. Move all those holes to the end. 
Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/20250206054504.2950516-7-neilb@suse.de Signed-off-by: Christian Brauner --- include/linux/dcache.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4afb60365675..bb7216b22091 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -203,34 +203,34 @@ struct dentry_operations { #define DCACHE_NFSFS_RENAMED BIT(12) /* this dentry has been "silly renamed" and has to be deleted on the last * dput() */ -#define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(14) +#define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(13) /* Parent inode is watched by some fsnotify listener */ -#define DCACHE_DENTRY_KILLED BIT(15) +#define DCACHE_DENTRY_KILLED BIT(14) -#define DCACHE_MOUNTED BIT(16) /* is a mountpoint */ -#define DCACHE_NEED_AUTOMOUNT BIT(17) /* handle automount on this dir */ -#define DCACHE_MANAGE_TRANSIT BIT(18) /* manage transit from this dirent */ +#define DCACHE_MOUNTED BIT(15) /* is a mountpoint */ +#define DCACHE_NEED_AUTOMOUNT BIT(16) /* handle automount on this dir */ +#define DCACHE_MANAGE_TRANSIT BIT(17) /* manage transit from this dirent */ #define DCACHE_MANAGED_DENTRY \ (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) -#define DCACHE_LRU_LIST BIT(19) +#define DCACHE_LRU_LIST BIT(18) -#define DCACHE_ENTRY_TYPE (7 << 20) /* bits 20..22 are for storing type: */ -#define DCACHE_MISS_TYPE (0 << 20) /* Negative dentry */ -#define DCACHE_WHITEOUT_TYPE (1 << 20) /* Whiteout dentry (stop pathwalk) */ -#define DCACHE_DIRECTORY_TYPE (2 << 20) /* Normal directory */ -#define DCACHE_AUTODIR_TYPE (3 << 20) /* Lookupless directory (presumed automount) */ -#define DCACHE_REGULAR_TYPE (4 << 20) /* Regular file type */ -#define DCACHE_SPECIAL_TYPE (5 << 20) /* Other file type */ -#define DCACHE_SYMLINK_TYPE (6 << 20) /* Symlink */ +#define DCACHE_ENTRY_TYPE (7 << 19) /* bits 19..21 are for storing type: */ +#define DCACHE_MISS_TYPE (0 << 19) /* Negative dentry */ +#define DCACHE_WHITEOUT_TYPE (1 << 19) /* Whiteout dentry (stop pathwalk) */ +#define DCACHE_DIRECTORY_TYPE (2 << 19) /* Normal directory */ +#define DCACHE_AUTODIR_TYPE (3 << 19) /* Lookupless directory (presumed automount) */ +#define DCACHE_REGULAR_TYPE (4 << 19) /* Regular file type */ +#define DCACHE_SPECIAL_TYPE (5 << 19) /* Other file type */ +#define DCACHE_SYMLINK_TYPE (6 << 19) /* Symlink */ -#define DCACHE_NOKEY_NAME BIT(25) /* Encrypted name encoded without key */ -#define DCACHE_OP_REAL BIT(26) +#define DCACHE_NOKEY_NAME BIT(22) /* Encrypted name encoded without key */ +#define DCACHE_OP_REAL BIT(23) -#define DCACHE_PAR_LOOKUP BIT(28) /* being looked up (with parent locked shared) */ -#define DCACHE_DENTRY_CURSOR BIT(29) -#define DCACHE_NORCU BIT(30) /* No RCU delay for freeing */ +#define DCACHE_PAR_LOOKUP BIT(24) /* being looked up (with parent locked shared) */ +#define DCACHE_DENTRY_CURSOR BIT(25) +#define DCACHE_NORCU BIT(26) /* No RCU delay for freeing */ extern seqlock_t rename_lock; -- cgit v1.2.3 From 2c3230fb8db9bf04d97a907f2fb86adb1e74e431 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 6 Feb 2025 16:42:44 +1100 Subject: VFS: repack LOOKUP_ bit flags. The LOOKUP_ bits are not in order, which can make it awkward when adding new bits. Two bits have recently been added to the end which makes them look like "scoping flags", but in fact they aren't. 
Also LOOKUP_PARENT is described as "internal use only" but is used in fs/nfs/ This patch: - Moves these three flags into the "pathwalk mode" section - changes all bits to use the BIT(n) macro - Allocates bits in order leaving gaps between the sections, and documents those gaps. Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/20250206054504.2950516-8-neilb@suse.de Signed-off-by: Christian Brauner --- include/linux/namei.h | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/namei.h b/include/linux/namei.h index 8ec8fed3bce8..e3042176cdf4 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -18,35 +18,36 @@ enum { MAX_NESTED_LINKS = 8 }; enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; /* pathwalk mode */ -#define LOOKUP_FOLLOW 0x0001 /* follow links at the end */ -#define LOOKUP_DIRECTORY 0x0002 /* require a directory */ -#define LOOKUP_AUTOMOUNT 0x0004 /* force terminal automount */ -#define LOOKUP_EMPTY 0x4000 /* accept empty path [user_... only] */ -#define LOOKUP_DOWN 0x8000 /* follow mounts in the starting point */ -#define LOOKUP_MOUNTPOINT 0x0080 /* follow mounts in the end */ - -#define LOOKUP_REVAL 0x0020 /* tell ->d_revalidate() to trust no cache */ -#define LOOKUP_RCU 0x0040 /* RCU pathwalk mode; semi-internal */ +#define LOOKUP_FOLLOW BIT(0) /* follow links at the end */ +#define LOOKUP_DIRECTORY BIT(1) /* require a directory */ +#define LOOKUP_AUTOMOUNT BIT(2) /* force terminal automount */ +#define LOOKUP_EMPTY BIT(3) /* accept empty path [user_... only] */ +#define LOOKUP_LINKAT_EMPTY BIT(4) /* Linkat request with empty path. */ +#define LOOKUP_DOWN BIT(5) /* follow mounts in the starting point */ +#define LOOKUP_MOUNTPOINT BIT(6) /* follow mounts in the end */ +#define LOOKUP_REVAL BIT(7) /* tell ->d_revalidate() to trust no cache */ +#define LOOKUP_RCU BIT(8) /* RCU pathwalk mode; semi-internal */ +#define LOOKUP_CACHED BIT(9) /* Only do cached lookup */ +#define LOOKUP_PARENT BIT(10) /* Looking up final parent in path */ +/* 5 spare bits for pathwalk */ /* These tell filesystem methods that we are dealing with the final component... */ -#define LOOKUP_OPEN 0x0100 /* ... in open */ -#define LOOKUP_CREATE 0x0200 /* ... in object creation */ -#define LOOKUP_EXCL 0x0400 /* ... in exclusive creation */ -#define LOOKUP_RENAME_TARGET 0x0800 /* ... in destination of rename() */ +#define LOOKUP_OPEN BIT(16) /* ... in open */ +#define LOOKUP_CREATE BIT(17) /* ... in object creation */ +#define LOOKUP_EXCL BIT(18) /* ... in target must not exist */ +#define LOOKUP_RENAME_TARGET BIT(19) /* ... in destination of rename() */ -/* internal use only */ -#define LOOKUP_PARENT 0x0010 +/* 4 spare bits for intent */ /* Scoping flags for lookup. */ -#define LOOKUP_NO_SYMLINKS 0x010000 /* No symlink crossing. */ -#define LOOKUP_NO_MAGICLINKS 0x020000 /* No nd_jump_link() crossing. */ -#define LOOKUP_NO_XDEV 0x040000 /* No mountpoint crossing. */ -#define LOOKUP_BENEATH 0x080000 /* No escaping from starting point. */ -#define LOOKUP_IN_ROOT 0x100000 /* Treat dirfd as fs root. */ -#define LOOKUP_CACHED 0x200000 /* Only do cached lookup */ -#define LOOKUP_LINKAT_EMPTY 0x400000 /* Linkat request with empty path. */ +#define LOOKUP_NO_SYMLINKS BIT(24) /* No symlink crossing. */ +#define LOOKUP_NO_MAGICLINKS BIT(25) /* No nd_jump_link() crossing. */ +#define LOOKUP_NO_XDEV BIT(26) /* No mountpoint crossing. 
*/ +#define LOOKUP_BENEATH BIT(27) /* No escaping from starting point. */ +#define LOOKUP_IN_ROOT BIT(28) /* Treat dirfd as fs root. */ /* LOOKUP_* flags which do scope-related checks based on the dirfd. */ #define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT) +/* 3 spare bits for scoping */ extern int path_pts(struct path *path); -- cgit v1.2.3 From 31d43141d13aa63587f140884b1f667800ce4e1d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 5 Feb 2025 16:57:09 +0200 Subject: dmaengine: Replace dma_request_slave_channel() by dma_request_chan() Replace dma_request_slave_channel() by dma_request_chan() as suggested since the former is deprecated. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250205145757.889247-2-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 346251bf1026..83cbf7197a76 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1639,8 +1639,8 @@ static inline struct dma_chan { struct dma_chan *chan; - chan = dma_request_slave_channel(dev, name); - if (chan) + chan = dma_request_chan(dev, name); + if (!IS_ERR(chan)) return chan; if (!fn || !fn_param) -- cgit v1.2.3 From 1c83d3dfa0905590408595560629627cba4f9261 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 5 Feb 2025 16:57:10 +0200 Subject: dmaengine: Use dma_request_channel() instead of __dma_request_channel() Reduce use of internal __dma_request_channel() function in public dmaengine.h. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250205145757.889247-3-andriy.shevchenko@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 83cbf7197a76..a360e0330436 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1646,7 +1646,7 @@ static inline struct dma_chan if (!fn || !fn_param) return NULL; - return __dma_request_channel(&mask, fn, fn_param, NULL); + return dma_request_channel(mask, fn, fn_param); } static inline char * -- cgit v1.2.3 From abb0ea1923a68ec8f45a7615ebad1fc87ea06da6 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:44 -0500 Subject: iomap: factor out iomap length helper In preparation to support more granular iomap iter advancing, factor the pos/len values as parameters to length calculation. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-2-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 022d7f338c68..feb748eb6294 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -238,18 +238,33 @@ struct iomap_iter { int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); /** - * iomap_length - length of the current iomap iteration + * iomap_length_trim - trimmed length of the current iomap iteration * @iter: iteration structure + * @pos: File position to trim from. + * @len: Length of the mapping to trim to. * - * Returns the length that the operation applies to for the current iteration. 
+ * Returns a trimmed length that the operation applies to for the current + * iteration. */ -static inline u64 iomap_length(const struct iomap_iter *iter) +static inline u64 iomap_length_trim(const struct iomap_iter *iter, loff_t pos, + u64 len) { u64 end = iter->iomap.offset + iter->iomap.length; if (iter->srcmap.type != IOMAP_HOLE) end = min(end, iter->srcmap.offset + iter->srcmap.length); - return min(iter->len, end - iter->pos); + return min(len, end - pos); +} + +/** + * iomap_length - length of the current iomap iteration + * @iter: iteration structure + * + * Returns the length that the operation applies to for the current iteration. + */ +static inline u64 iomap_length(const struct iomap_iter *iter) +{ + return iomap_length_trim(iter, iter->pos, iter->len); } /** -- cgit v1.2.3 From b51d30ff51f9c325b65c8cd66ff6590530b14041 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:49 -0500 Subject: iomap: export iomap_iter_advance() and return remaining length As a final step for generic iter advance, export the helper and update it to return the remaining length of the current iteration after the advance. This will usually be 0 in the iomap_iter() case, but will be useful for the various operations that iterate on their own and will be updated to advance as they progress. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-7-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index feb748eb6294..eed06ffdcfbd 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -236,6 +236,7 @@ struct iomap_iter { }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); +int iomap_iter_advance(struct iomap_iter *iter, u64 *count); /** * iomap_length_trim - trimmed length of the current iomap iteration -- cgit v1.2.3 From bc264fea0f6f230e56f876cc4266b1982d20f35d Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 7 Feb 2025 09:32:50 -0500 Subject: iomap: support incremental iomap_iter advances The current iomap_iter iteration model reads the mapping from the filesystem, processes the subrange of the operation associated with the current mapping, and returns the number of bytes processed back to the iteration code. The latter advances the position and remaining length of the iter in preparation for the next iteration. At the _iter() handler level, this tends to produce a processing loop where the local code pulls the current position and remaining length out of the iter, iterates it locally based on file offset, and then breaks out when the associated range has been fully processed. This works well enough for current handlers, but upcoming enhancements require a bit more flexibility in certain situations. Enhancements for zero range will lead to a situation where the processing loop is no longer a pure ascending offset walk, but rather dictated by pagecache state and folio lookup. Since folio lookup and write preparation occur at different levels, it is more difficult to manage position and length outside of the iter. To provide more flexibility to certain iomap operations, introduce support for incremental iomap_iter advances from within the operation itself. This allows more granular advances for operations that might not use the typical file offset based walk. 
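[ ed: a hedged sketch of the handler shape this enables; foo_process() is hypothetical, and only iomap_length() and iomap_iter_advance() are taken from this series. ]

	static int foo_iomap_iter(struct iomap_iter *iter)
	{
		int ret;

		while (iomap_length(iter)) {
			/* bytes handled for whatever folio was found at the current pos */
			u64 bytes = foo_process(iter);

			/* advance pos/len in place instead of returning a byte count */
			ret = iomap_iter_advance(iter, &bytes);
			if (ret)
				return ret;
		}
		/* success or failure is reported via iter.processed, not a count */
		return 0;
	}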
Note that the semantics for operations that use incremental advances are slightly different from traditional operations. Operations that advance the iter directly are expected to return success or failure (i.e. 0 or negative error code) in iter.processed rather than the number of bytes processed. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250207143253.314068-8-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index eed06ffdcfbd..e180dacf434c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -218,8 +218,11 @@ struct iomap_ops { * calls to iomap_iter(). Treat as read-only in the body. * @len: The remaining length of the file segment we're operating on. * It is updated at the same time as @pos. - * @processed: The number of bytes processed by the body in the most recent - * iteration, or a negative errno. 0 causes the iteration to stop. + * @iter_start_pos: The original start pos for the current iomap. Used for + * incremental iter advance. + * @processed: The number of bytes the most recent iteration needs iomap_iter() + * to advance the iter, zero if the iter was already advanced, or a + * negative errno for an error during the operation. * @flags: Zero or more of the iomap_begin flags above. * @iomap: Map describing the I/O iteration * @srcmap: Source map for COW operations @@ -228,6 +231,7 @@ struct iomap_iter { struct inode *inode; loff_t pos; u64 len; + loff_t iter_start_pos; s64 processed; unsigned flags; struct iomap iomap; -- cgit v1.2.3 From 06b9e91425b26b5e782f82a043a3f6c6354947c2 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 18 Dec 2024 22:54:13 +0800 Subject: jbd2: remove unused transaction->t_private_list After the removal of the ext4 journal callback, transaction->t_private_list is not used anymore. Just remove the unused transaction->t_private_list. Signed-off-by: Kemeng Shi Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20241218145414.1422946-3-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 561025b4f3d9..053444f7a1d0 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -700,12 +700,6 @@ struct transaction_s /* Disk flush needs to be sent to fs partition [no locking] */ int t_need_data_flush; - - /* - * For use by the filesystem to store fs-specific data - * structures associated with the transaction - */ - struct list_head t_private_list; }; struct transaction_run_stats_s { -- cgit v1.2.3 From 4632cd0ec3faa568660a194d5a93450d9cbf7aa5 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Sun, 9 Feb 2025 22:32:15 -0500 Subject: lsm: fix a missing security_uring_allowed() prototype The !CONFIG_SECURITY dummy function was declared as an "extern int" instead of "static inline" (likely a copy-n-paste error), which was causing the compiler to complain about a missing prototype. This patch converts the dummy definition over to a "static inline" to resolve the compiler problems.
Reported-by: kernel test robot Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/oe-kbuild-all/202502081912.TeokpAAe-lkp@intel.com/ Fixes: c6ad9fdbd44b ("io_uring,lsm,selinux: add LSM hooks for io_uring_setup()") Signed-off-by: Paul Moore --- include/linux/security.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 3e68f8468a22..27f64a9747f8 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2376,7 +2376,7 @@ static inline int security_uring_cmd(struct io_uring_cmd *ioucmd) { return 0; } -extern int security_uring_allowed(void) +static inline int security_uring_allowed(void) { return 0; } -- cgit v1.2.3 From 07d0aa9393abc8fd64d0a174edfb68c5808187e4 Mon Sep 17 00:00:00 2001 From: Martin Kurbanov Date: Mon, 10 Feb 2025 17:34:13 +0300 Subject: mtd: spinand: make spinand_{read,write}_page global Change these functions from static to global so that they can be used later in OTP operations, since reading OTP pages is no different from reading pages from the main area. Signed-off-by: Martin Kurbanov Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 0da8a1c7740e..34ddd2441fc9 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -588,4 +588,10 @@ int spinand_upd_cfg(struct spinand_device *spinand, u8 mask, u8 val); int spinand_write_reg_op(struct spinand_device *spinand, u8 reg, u8 val); int spinand_select_target(struct spinand_device *spinand, unsigned int target); +int spinand_read_page(struct spinand_device *spinand, + const struct nand_page_io_req *req); + +int spinand_write_page(struct spinand_device *spinand, + const struct nand_page_io_req *req); + #endif /* __LINUX_MTD_SPINAND_H */ -- cgit v1.2.3 From c06b1f753bea40a282f29a9383fcf36b12323108 Mon Sep 17 00:00:00 2001 From: Martin Kurbanov Date: Mon, 10 Feb 2025 17:34:14 +0300 Subject: mtd: spinand: add OTP support The MTD subsystem already supports accessing two OTP areas: user and factory. User areas can be written by the user. This patch provides the SPINAND_FACT_OTP_INFO and SPINAND_USER_OTP_INFO macros to add parameters to spinand_info. To implement OTP operations, the client (flash driver) is provided with callbacks for user area: .read(), .write(), .info(), .lock(), .erase(); and for factory area: .read(), .info(). Signed-off-by: Martin Kurbanov Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 93 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 34ddd2441fc9..f4c965ae2f65 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -374,6 +374,67 @@ struct spinand_ondie_ecc_conf { u8 status; }; +/** + * struct spinand_otp_layout - structure to describe the SPI NAND OTP area + * @npages: number of pages in the OTP + * @start_page: start page of the user/factory OTP area.
+ */ +struct spinand_otp_layout { + unsigned int npages; + unsigned int start_page; +}; + +/** + * struct spinand_fact_otp_ops - SPI NAND OTP methods for factory area + * @info: get the OTP area information + * @read: read from the SPI NAND OTP area + */ +struct spinand_fact_otp_ops { + int (*info)(struct spinand_device *spinand, size_t len, + struct otp_info *buf, size_t *retlen); + int (*read)(struct spinand_device *spinand, loff_t from, size_t len, + size_t *retlen, u8 *buf); +}; + +/** + * struct spinand_user_otp_ops - SPI NAND OTP methods for user area + * @info: get the OTP area information + * @lock: lock an OTP region + * @erase: erase an OTP region + * @read: read from the SPI NAND OTP area + * @write: write to the SPI NAND OTP area + */ +struct spinand_user_otp_ops { + int (*info)(struct spinand_device *spinand, size_t len, + struct otp_info *buf, size_t *retlen); + int (*lock)(struct spinand_device *spinand, loff_t from, size_t len); + int (*erase)(struct spinand_device *spinand, loff_t from, size_t len); + int (*read)(struct spinand_device *spinand, loff_t from, size_t len, + size_t *retlen, u8 *buf); + int (*write)(struct spinand_device *spinand, loff_t from, size_t len, + size_t *retlen, const u8 *buf); +}; + +/** + * struct spinand_fact_otp - SPI NAND OTP grouping structure for factory area + * @layout: OTP region layout + * @ops: OTP access ops + */ +struct spinand_fact_otp { + const struct spinand_otp_layout layout; + const struct spinand_fact_otp_ops *ops; +}; + +/** + * struct spinand_user_otp - SPI NAND OTP grouping structure for user area + * @layout: OTP region layout + * @ops: OTP access ops + */ +struct spinand_user_otp { + const struct spinand_otp_layout layout; + const struct spinand_user_otp_ops *ops; +}; + /** * struct spinand_info - Structure used to describe SPI NAND chips * @model: model name @@ -389,6 +450,8 @@ struct spinand_ondie_ecc_conf { * @select_target: function used to select a target/die. Required only for * multi-die chips * @set_cont_read: enable/disable continuous cached reads + * @fact_otp: SPI NAND factory OTP info. + * @user_otp: SPI NAND user OTP info. * * Each SPI NAND manufacturer driver should have a spinand_info table * describing all the chips supported by the driver. @@ -409,6 +472,8 @@ struct spinand_info { unsigned int target); int (*set_cont_read)(struct spinand_device *spinand, bool enable); + struct spinand_fact_otp fact_otp; + struct spinand_user_otp user_otp; }; #define SPINAND_ID(__method, ...) \ @@ -437,6 +502,24 @@ struct spinand_info { #define SPINAND_CONT_READ(__set_cont_read) \ .set_cont_read = __set_cont_read, +#define SPINAND_FACT_OTP_INFO(__npages, __start_page, __ops) \ + .fact_otp = { \ + .layout = { \ + .npages = __npages, \ + .start_page = __start_page, \ + }, \ + .ops = __ops, \ + } + +#define SPINAND_USER_OTP_INFO(__npages, __start_page, __ops) \ + .user_otp = { \ + .layout = { \ + .npages = __npages, \ + .start_page = __start_page, \ + }, \ + .ops = __ops, \ + } + #define SPINAND_INFO(__model, __id, __memorg, __eccreq, __op_variants, \ __flags, ...) \ { \ @@ -487,6 +570,8 @@ struct spinand_dirmap { * actually relevant to enable this feature. * @set_cont_read: Enable/disable the continuous read feature * @priv: manufacturer private data + * @fact_otp: SPI NAND factory OTP info. + * @user_otp: SPI NAND user OTP info. 
*/ struct spinand_device { struct nand_device base; @@ -519,6 +604,9 @@ struct spinand_device { bool cont_read_possible; int (*set_cont_read)(struct spinand_device *spinand, bool enable); + + const struct spinand_fact_otp *fact_otp; + const struct spinand_user_otp *user_otp; }; /** @@ -594,4 +682,9 @@ int spinand_read_page(struct spinand_device *spinand, int spinand_write_page(struct spinand_device *spinand, const struct nand_page_io_req *req); +size_t spinand_fact_otp_size(struct spinand_device *spinand); +size_t spinand_user_otp_size(struct spinand_device *spinand); + +int spinand_set_mtd_otp_ops(struct spinand_device *spinand); + #endif /* __LINUX_MTD_SPINAND_H */ -- cgit v1.2.3 From e278b8c73b0526f8e3ee22f4827c8fe07c2109ba Mon Sep 17 00:00:00 2001 From: Martin Kurbanov Date: Mon, 10 Feb 2025 17:34:15 +0300 Subject: mtd: spinand: make spinand_{wait,otp_page_size} global Change the functions spinand_wait() and spinand_otp_page_size() from static to global so that SPI NAND flash drivers don't duplicate them. Signed-off-by: Martin Kurbanov Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index f4c965ae2f65..597d28339d15 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -676,12 +676,16 @@ int spinand_upd_cfg(struct spinand_device *spinand, u8 mask, u8 val); int spinand_write_reg_op(struct spinand_device *spinand, u8 reg, u8 val); int spinand_select_target(struct spinand_device *spinand, unsigned int target); +int spinand_wait(struct spinand_device *spinand, unsigned long initial_delay_us, + unsigned long poll_delay_us, u8 *s); + int spinand_read_page(struct spinand_device *spinand, const struct nand_page_io_req *req); int spinand_write_page(struct spinand_device *spinand, const struct nand_page_io_req *req); +size_t spinand_otp_page_size(struct spinand_device *spinand); size_t spinand_fact_otp_size(struct spinand_device *spinand); size_t spinand_user_otp_size(struct spinand_device *spinand); -- cgit v1.2.3 From 9ad2857c82d56017402256fd3e2ed401cc7f6fb9 Mon Sep 17 00:00:00 2001 From: Martin Kurbanov Date: Mon, 10 Feb 2025 17:34:16 +0300 Subject: mtd: spinand: otp: add helper functions Introduce the global functions spinand_otp_read() and spinand_otp_write(). Since most SPI-NAND flashes read/write OTP in the same way, let's define global functions to avoid code duplication.
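[ ed: a hedged sketch of how a manufacturer driver might combine its own hooks with the shared helpers this patch declares (see the diff that follows); the foo_* names are hypothetical. ]

	static const struct spinand_user_otp_ops foo_user_otp_ops = {
		.info	= foo_user_otp_info,		/* driver-specific */
		.lock	= foo_user_otp_lock,		/* driver-specific */
		.erase	= foo_user_otp_erase,		/* driver-specific */
		.read	= spinand_user_otp_read,	/* shared helper */
		.write	= spinand_user_otp_write,	/* shared helper */
	};

	/* wired up via SPINAND_USER_OTP_INFO(npages, start_page, &foo_user_otp_ops)
	 * in the chip's SPINAND_INFO() entry */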
Signed-off-by: Martin Kurbanov Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 597d28339d15..73424405232a 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -689,6 +689,13 @@ size_t spinand_otp_page_size(struct spinand_device *spinand); size_t spinand_fact_otp_size(struct spinand_device *spinand); size_t spinand_user_otp_size(struct spinand_device *spinand); +int spinand_fact_otp_read(struct spinand_device *spinand, loff_t ofs, + size_t len, size_t *retlen, u8 *buf); +int spinand_user_otp_read(struct spinand_device *spinand, loff_t ofs, + size_t len, size_t *retlen, u8 *buf); +int spinand_user_otp_write(struct spinand_device *spinand, loff_t ofs, + size_t len, size_t *retlen, const u8 *buf); + int spinand_set_mtd_otp_ops(struct spinand_device *spinand); #endif /* __LINUX_MTD_SPINAND_H */ -- cgit v1.2.3 From ebc4176551cdd021d02f4d2ed734e7b65e44442a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 3 Feb 2025 22:00:35 -0800 Subject: blk-crypto: add basic hardware-wrapped key support To prevent keys from being compromised if an attacker acquires read access to kernel memory, some inline encryption hardware can accept keys which are wrapped by a per-boot hardware-internal key. This avoids needing to keep the raw keys in kernel memory, without limiting the number of keys that can be used. Such hardware also supports deriving a "software secret" for cryptographic tasks that can't be handled by inline encryption; this is needed for fscrypt to work properly. To support this hardware, allow struct blk_crypto_key to represent a hardware-wrapped key as an alternative to a raw key, and make drivers set flags in struct blk_crypto_profile to indicate which types of keys they support. Also add the ->derive_sw_secret() low-level operation, which drivers supporting wrapped keys must implement. For more information, see the detailed documentation which this patch adds to Documentation/block/inline-encryption.rst. Signed-off-by: Eric Biggers Tested-by: Bartosz Golaszewski # sm8650 Link: https://lore.kernel.org/r/20250204060041.409950-2-ebiggers@kernel.org Signed-off-by: Jens Axboe --- include/linux/blk-crypto-profile.h | 20 +++++++++++ include/linux/blk-crypto.h | 72 ++++++++++++++++++++++++++++++++++---- 2 files changed, 85 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h index 90ab33cb5d0e..7764b4f7b45b 100644 --- a/include/linux/blk-crypto-profile.h +++ b/include/linux/blk-crypto-profile.h @@ -57,6 +57,20 @@ struct blk_crypto_ll_ops { int (*keyslot_evict)(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, unsigned int slot); + + /** + * @derive_sw_secret: Derive the software secret from a hardware-wrapped + * key in ephemerally-wrapped form. + * + * This only needs to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED + * is supported. + * + * Must return 0 on success, -EBADMSG if the key is invalid, or another + * -errno code on other errors. 
+ */ + int (*derive_sw_secret)(struct blk_crypto_profile *profile, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); }; /** @@ -84,6 +98,12 @@ struct blk_crypto_profile { */ unsigned int max_dun_bytes_supported; + /** + * @key_types_supported: A bitmask of the supported key types: + * BLK_CRYPTO_KEY_TYPE_RAW and/or BLK_CRYPTO_KEY_TYPE_HW_WRAPPED. + */ + unsigned int key_types_supported; + /** * @modes_supported: Array of bitmasks that specifies whether each * combination of crypto mode and data unit size is supported. diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 5e5822c18ee4..81f932b3ea37 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -6,6 +6,7 @@ #ifndef __LINUX_BLK_CRYPTO_H #define __LINUX_BLK_CRYPTO_H +#include #include enum blk_crypto_mode_num { @@ -17,7 +18,55 @@ enum blk_crypto_mode_num { BLK_ENCRYPTION_MODE_MAX, }; -#define BLK_CRYPTO_MAX_KEY_SIZE 64 +/* + * Supported types of keys. Must be bitflags due to their use in + * blk_crypto_profile::key_types_supported. + */ +enum blk_crypto_key_type { + /* + * Raw keys (i.e. "software keys"). These keys are simply kept in raw, + * plaintext form in kernel memory. + */ + BLK_CRYPTO_KEY_TYPE_RAW = 0x1, + + /* + * Hardware-wrapped keys. These keys are only present in kernel memory + * in ephemerally-wrapped form, and they can only be unwrapped by + * dedicated hardware. For details, see the "Hardware-wrapped keys" + * section of Documentation/block/inline-encryption.rst. + */ + BLK_CRYPTO_KEY_TYPE_HW_WRAPPED = 0x2, +}; + +/* + * Currently the maximum raw key size is 64 bytes, as that is the key size of + * BLK_ENCRYPTION_MODE_AES_256_XTS which takes the longest key. + * + * The maximum hardware-wrapped key size depends on the hardware's key wrapping + * algorithm, which is a hardware implementation detail, so it isn't precisely + * specified. But currently 128 bytes is plenty in practice. Implementations + * are recommended to wrap a 32-byte key for the hardware KDF with AES-256-GCM, + * which should result in a size closer to 64 bytes than 128. + * + * Both of these values can trivially be increased if ever needed. + */ +#define BLK_CRYPTO_MAX_RAW_KEY_SIZE 64 +#define BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE 128 + +#define BLK_CRYPTO_MAX_ANY_KEY_SIZE \ + MAX(BLK_CRYPTO_MAX_RAW_KEY_SIZE, BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE) + +/* + * Size of the "software secret" which can be derived from a hardware-wrapped + * key. This is currently always 32 bytes. Note, the choice of 32 bytes + * assumes that the software secret is only used directly for algorithms that + * don't require more than a 256-bit key to get the desired security strength. + * If it were to be used e.g. directly as an AES-256-XTS key, then this would + * need to be increased (which is possible if hardware supports it, but care + * would need to be taken to avoid breaking users who need exactly 32 bytes). + */ +#define BLK_CRYPTO_SW_SECRET_SIZE 32 + /** * struct blk_crypto_config - an inline encryption key's crypto configuration * @crypto_mode: encryption algorithm this key is for @@ -26,20 +75,23 @@ enum blk_crypto_mode_num { * ciphertext. This is always a power of 2. It might be e.g. the * filesystem block size or the disk sector size. 
* @dun_bytes: the maximum number of bytes of DUN used when using this key + * @key_type: the type of this key -- either raw or hardware-wrapped */ struct blk_crypto_config { enum blk_crypto_mode_num crypto_mode; unsigned int data_unit_size; unsigned int dun_bytes; + enum blk_crypto_key_type key_type; }; /** * struct blk_crypto_key - an inline encryption key - * @crypto_cfg: the crypto configuration (like crypto_mode, key size) for this - * key + * @crypto_cfg: the crypto mode, data unit size, key type, and other + * characteristics of this key and how it will be used * @data_unit_size_bits: log2 of data_unit_size - * @size: size of this key in bytes (determined by @crypto_cfg.crypto_mode) - * @raw: the raw bytes of this key. Only the first @size bytes are used. + * @size: size of this key in bytes. The size of a raw key is fixed for a given + * crypto mode, but the size of a hardware-wrapped key can vary. + * @bytes: the bytes of this key. Only the first @size bytes are significant. * * A blk_crypto_key is immutable once created, and many bios can reference it at * the same time. It must not be freed until all bios using it have completed @@ -49,7 +101,7 @@ struct blk_crypto_key { struct blk_crypto_config crypto_cfg; unsigned int data_unit_size_bits; unsigned int size; - u8 raw[BLK_CRYPTO_MAX_KEY_SIZE]; + u8 bytes[BLK_CRYPTO_MAX_ANY_KEY_SIZE]; }; #define BLK_CRYPTO_MAX_IV_SIZE 32 @@ -87,7 +139,9 @@ bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, unsigned int bytes, const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]); -int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, +int blk_crypto_init_key(struct blk_crypto_key *blk_key, + const u8 *key_bytes, size_t key_size, + enum blk_crypto_key_type key_type, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size); @@ -103,6 +157,10 @@ bool blk_crypto_config_supported_natively(struct block_device *bdev, bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); +int blk_crypto_derive_sw_secret(struct block_device *bdev, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool bio_has_crypt_ctx(struct bio *bio) -- cgit v1.2.3 From 1ebd4a3c095cd538d3c1c7c12738ef47d8e71f96 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 3 Feb 2025 22:00:37 -0800 Subject: blk-crypto: add ioctls to create and prepare hardware-wrapped keys Until this point, the kernel can use hardware-wrapped keys to do encryption if userspace provides one -- specifically a key in ephemerally-wrapped form. However, no generic way has been provided for userspace to get such a key in the first place. Getting such a key is a two-step process. First, the key needs to be imported from a raw key or generated by the hardware, producing a key in long-term wrapped form. This happens once in the whole lifetime of the key. Second, the long-term wrapped key needs to be converted into ephemerally-wrapped form. This happens each time the key is "unlocked". In Android, these operations are supported in a generic way through KeyMint, a userspace abstraction layer. However, that method is Android-specific and can't be used on other Linux systems, may rely on proprietary libraries, and also misleads people into supporting KeyMint features like rollback resistance that make sense for other KeyMint keys but don't make sense for hardware-wrapped inline encryption keys. 
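In other words, the lifecycle a generic interface has to cover looks roughly like this (a sketch only; the ioctl names are the ones introduced below, and the argument structures are omitted -- see the uapi header changes for the real layout):

	/* once in the key's lifetime: raw key (or hardware RNG) -> long-term wrapped key */
	ioctl(blkdev_fd, BLKCRYPTOIMPORTKEY, &import_arg);

	/* each time the key is "unlocked": long-term wrapped -> ephemerally-wrapped key */
	ioctl(blkdev_fd, BLKCRYPTOPREPAREKEY, &prepare_arg);

The ephemerally-wrapped key produced by the second step is what the kernel then uses for inline encryption.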
Therefore, this patch provides a generic kernel interface for these operations by introducing new block device ioctls: - BLKCRYPTOIMPORTKEY: convert a raw key to long-term wrapped form. - BLKCRYPTOGENERATEKEY: have the hardware generate a new key, then return it in long-term wrapped form. - BLKCRYPTOPREPAREKEY: convert a key from long-term wrapped form to ephemerally-wrapped form. These ioctls are implemented using new operations in blk_crypto_ll_ops. Signed-off-by: Eric Biggers Tested-by: Bartosz Golaszewski # sm8650 Link: https://lore.kernel.org/r/20250204060041.409950-4-ebiggers@kernel.org Signed-off-by: Jens Axboe --- include/linux/blk-crypto-profile.h | 53 ++++++++++++++++++++++++++++++++++++++ include/linux/blk-crypto.h | 1 + 2 files changed, 54 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h index 7764b4f7b45b..4f39e9cd7576 100644 --- a/include/linux/blk-crypto-profile.h +++ b/include/linux/blk-crypto-profile.h @@ -71,6 +71,48 @@ struct blk_crypto_ll_ops { int (*derive_sw_secret)(struct blk_crypto_profile *profile, const u8 *eph_key, size_t eph_key_size, u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); + + /** + * @import_key: Create a hardware-wrapped key by importing a raw key. + * + * This only needs to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED + * is supported. + * + * On success, must write the new key in long-term wrapped form to + * @lt_key and return its size in bytes. On failure, must return a + * -errno value. + */ + int (*import_key)(struct blk_crypto_profile *profile, + const u8 *raw_key, size_t raw_key_size, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + + /** + * @generate_key: Generate a hardware-wrapped key. + * + * This only needs to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED + * is supported. + * + * On success, must write the new key in long-term wrapped form to + * @lt_key and return its size in bytes. On failure, must return a + * -errno value. + */ + int (*generate_key)(struct blk_crypto_profile *profile, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + + /** + * @prepare_key: Prepare a hardware-wrapped key to be used. + * + * Prepare a hardware-wrapped key to be used by converting it from + * long-term wrapped form to ephemerally-wrapped form. This only needs + * to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED is supported. + * + * On success, must write the key in ephemerally-wrapped form to + * @eph_key and return its size in bytes. On failure, must return + * -EBADMSG if the key is invalid, or another -errno on other error. 
+ */ + int (*prepare_key)(struct blk_crypto_profile *profile, + const u8 *lt_key, size_t lt_key_size, + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); }; /** @@ -163,6 +205,17 @@ void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile); void blk_crypto_profile_destroy(struct blk_crypto_profile *profile); +int blk_crypto_import_key(struct blk_crypto_profile *profile, + const u8 *raw_key, size_t raw_key_size, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + +int blk_crypto_generate_key(struct blk_crypto_profile *profile, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + +int blk_crypto_prepare_key(struct blk_crypto_profile *profile, + const u8 *lt_key, size_t lt_key_size, + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]); + void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent, const struct blk_crypto_profile *child); diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 81f932b3ea37..58b0c5254a67 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -8,6 +8,7 @@ #include <linux/minmax.h> #include <linux/types.h> +#include <uapi/linux/blk-crypto.h> enum blk_crypto_mode_num { BLK_ENCRYPTION_MODE_INVALID, -- cgit v1.2.3 From b37778bec82ba82058912ca069881397197cd3d5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Jan 2025 16:03:07 +0100 Subject: seccomp: fix the __secure_computing() stub for !HAVE_ARCH_SECCOMP_FILTER Depending on CONFIG_HAVE_ARCH_SECCOMP_FILTER, __secure_computing(NULL) will crash or not. This is not consistent/safe, especially considering that after the previous change __secure_computing(sd) is always called with sd == NULL. Fortunately, if CONFIG_HAVE_ARCH_SECCOMP_FILTER=n, __secure_computing() has no callers; these architectures use secure_computing_strict(). Yet it makes sense to make __secure_computing(NULL) safe in this case. Note also that with this change we can unexport secure_computing_strict() and change the current callers to use __secure_computing(NULL). Fixes: 8cf8dfceebda ("seccomp: Stub for !HAVE_ARCH_SECCOMP_FILTER") Signed-off-by: Oleg Nesterov Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250128150307.GA15325@redhat.com Signed-off-by: Kees Cook --- include/linux/seccomp.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index e45531455d3b..d55949071c30 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -22,8 +22,9 @@ #include #include -#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER extern int __secure_computing(const struct seccomp_data *sd); + +#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER static inline int secure_computing(void) { if (unlikely(test_syscall_work(SECCOMP))) @@ -32,11 +33,6 @@ static inline int secure_computing(void) } #else extern void secure_computing_strict(int this_syscall); -static inline int __secure_computing(const struct seccomp_data *sd) -{ - secure_computing_strict(sd->nr); - return 0; -} #endif extern long prctl_get_seccomp(void); -- cgit v1.2.3 From 1027cd8084bbcdf80d8a096d5e2c6da91402fc3c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Jan 2025 16:03:13 +0100 Subject: seccomp: remove the 'sd' argument from __secure_computing() After the previous changes 'sd' is always NULL.
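With the argument gone, a caller such as the generic entry code reduces to the following pattern (a sketch; the exact caller code differs per architecture):

	/* in syscall_trace_enter(), seccomp runs after ptrace */
	ret = __secure_computing();
	if (ret == -1L)
		return ret;	/* seccomp asked to skip/deny the syscall */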
Signed-off-by: Oleg Nesterov Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250128150313.GA15336@redhat.com Signed-off-by: Kees Cook --- include/linux/seccomp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index d55949071c30..9b959972bf4a 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -22,13 +22,13 @@ #include #include -extern int __secure_computing(const struct seccomp_data *sd); +extern int __secure_computing(void); #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER static inline int secure_computing(void) { if (unlikely(test_syscall_work(SECCOMP))) - return __secure_computing(NULL); + return __secure_computing(); return 0; } #else @@ -54,7 +54,7 @@ static inline int secure_computing(void) { return 0; } #else static inline void secure_computing_strict(int this_syscall) { return; } #endif -static inline int __secure_computing(const struct seccomp_data *sd) { return 0; } +static inline int __secure_computing(void) { return 0; } static inline long prctl_get_seccomp(void) { -- cgit v1.2.3 From 4fe7fd17fe66a5e243c1de7ff32c29fac7039b8d Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 7 Feb 2025 14:09:05 -0600 Subject: iio: buffer-dmaengine: split requesting DMA channel from allocating buffer Refactor the IIO dmaengine buffer code to split requesting the DMA channel from allocating the buffer. We want to be able to add a new function where the IIO device driver manages the DMA channel, so these two actions need to be separate. To do this, calling dma_request_chan() is moved from iio_dmaengine_buffer_alloc() to iio_dmaengine_buffer_setup_ext(). A new __iio_dmaengine_buffer_setup_ext() helper function is added to simplify error unwinding and will also be used by a new function in a later patch. iio_dmaengine_buffer_free() now only frees the buffer and does not release the DMA channel. A new iio_dmaengine_buffer_teardown() function is added to unwind everything done in iio_dmaengine_buffer_setup_ext(). This keeps things more symmetrical with obvious pairs alloc/free and setup/teardown. Calling dma_get_slave_caps() in iio_dmaengine_buffer_alloc() is moved so that we can avoid any gotos for error unwinding. Reviewed-by: Nuno Sa Signed-off-by: David Lechner Link: https://patch.msgid.link/20250207-dlech-mainline-spi-engine-offload-2-v8-8-e48a489be48c@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer-dmaengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/buffer-dmaengine.h b/include/linux/iio/buffer-dmaengine.h index 81d9a19aeb91..72a2e3fd8a5b 100644 --- a/include/linux/iio/buffer-dmaengine.h +++ b/include/linux/iio/buffer-dmaengine.h @@ -12,7 +12,7 @@ struct iio_dev; struct device; -void iio_dmaengine_buffer_free(struct iio_buffer *buffer); +void iio_dmaengine_buffer_teardown(struct iio_buffer *buffer); struct iio_buffer *iio_dmaengine_buffer_setup_ext(struct device *dev, struct iio_dev *indio_dev, const char *channel, -- cgit v1.2.3 From 79f24971b4ffbdba9733365e298982c45338e6b1 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 7 Feb 2025 14:09:06 -0600 Subject: iio: buffer-dmaengine: add devm_iio_dmaengine_buffer_setup_with_handle() Add a new devm_iio_dmaengine_buffer_setup_with_handle() function to handle cases where the DMA channel is managed by the caller rather than being requested and released by the iio_dmaengine module. 
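A driver that manages its own channel would then pair the pieces like this (a sketch; the device, channel name and error handling are made up):

	struct dma_chan *chan;

	chan = dma_request_chan(dev, "rx");
	if (IS_ERR(chan))
		return PTR_ERR(chan);

	ret = devm_iio_dmaengine_buffer_setup_with_handle(dev, indio_dev, chan,
							  IIO_BUFFER_DIRECTION_IN);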
Reviewed-by: Nuno Sa Signed-off-by: David Lechner Link: https://patch.msgid.link/20250207-dlech-mainline-spi-engine-offload-2-v8-9-e48a489be48c@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer-dmaengine.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/buffer-dmaengine.h b/include/linux/iio/buffer-dmaengine.h index 72a2e3fd8a5b..37f27545f69f 100644 --- a/include/linux/iio/buffer-dmaengine.h +++ b/include/linux/iio/buffer-dmaengine.h @@ -11,6 +11,7 @@ struct iio_dev; struct device; +struct dma_chan; void iio_dmaengine_buffer_teardown(struct iio_buffer *buffer); struct iio_buffer *iio_dmaengine_buffer_setup_ext(struct device *dev, @@ -26,6 +27,10 @@ int devm_iio_dmaengine_buffer_setup_ext(struct device *dev, struct iio_dev *indio_dev, const char *channel, enum iio_buffer_direction dir); +int devm_iio_dmaengine_buffer_setup_with_handle(struct device *dev, + struct iio_dev *indio_dev, + struct dma_chan *chan, + enum iio_buffer_direction dir); #define devm_iio_dmaengine_buffer_setup(dev, indio_dev, channel) \ devm_iio_dmaengine_buffer_setup_ext(dev, indio_dev, channel, \ -- cgit v1.2.3 From 2ea97b76d6712bfb0408e5b81ffd7bc4551d3153 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2025 22:16:09 +0100 Subject: hrtimers: Make hrtimer_update_function() less expensive The sanity checks in hrtimer_update_function() are expensive for high frequency usage like in the io/uring code due to locking. Hide the sanity checks behind CONFIG_PROVE_LOCKING, which has a decent chance to be enabled on a regular basis for testing. Fixes: 8f02e3563bb5 ("hrtimers: Introduce hrtimer_update_function()") Reported-by: Jens Axboe Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/87ikpllali.ffs@tglx --- include/linux/hrtimer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index f7bfdcf0dda3..bdd55c1f0d73 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -333,6 +333,7 @@ static inline int hrtimer_callback_running(struct hrtimer *timer) static inline void hrtimer_update_function(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *)) { +#ifdef CONFIG_PROVE_LOCKING guard(raw_spinlock_irqsave)(&timer->base->cpu_base->lock); if (WARN_ON_ONCE(hrtimer_is_queued(timer))) @@ -340,7 +341,7 @@ static inline void hrtimer_update_function(struct hrtimer *timer, if (WARN_ON_ONCE(!function)) return; - +#endif timer->function = function; } -- cgit v1.2.3 From 0de47f28ec8444ead678f96e729105ca4bcc6db2 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Wed, 15 Jan 2025 14:48:01 +0900 Subject: crash: Use note name macros Use note name macros to match with the userspace's expectation. 
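For reference, the NN_* macros used below are assumed to be the note-name counterparts of the existing NT_* note types, added to the uapi ELF header elsewhere in this series, e.g.:

	#define NN_PRSTATUS "CORE"	/* pairs with NT_PRSTATUS */

so ALIGN(sizeof(NN_PRSTATUS), 4) yields the same value the old CRASH_CORE_NOTE_NAME ("CORE") did.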
Signed-off-by: Akihiko Odaki Acked-by: Baoquan He Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20250115-elf-v5-4-0f9e55bbb2fc@daynix.com Signed-off-by: Kees Cook --- include/linux/vmcore_info.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h index e1dec1a6a749..1672801fd98c 100644 --- a/include/linux/vmcore_info.h +++ b/include/linux/vmcore_info.h @@ -8,7 +8,7 @@ #define CRASH_CORE_NOTE_NAME "CORE" #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) -#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) +#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4) #define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) /* -- cgit v1.2.3 From 7e620b56d958a5efcb0158c366581e0637fd9a50 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Wed, 15 Jan 2025 14:48:03 +0900 Subject: crash: Remove KEXEC_CORE_NOTE_NAME KEXEC_CORE_NOTE_NAME is no longer used. Signed-off-by: Akihiko Odaki Acked-by: Baoquan He Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20250115-elf-v5-6-0f9e55bbb2fc@daynix.com Signed-off-by: Kees Cook --- include/linux/kexec.h | 2 -- include/linux/vmcore_info.h | 1 - 2 files changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index f0e9f8eda7a3..c840431eadda 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -68,8 +68,6 @@ extern note_buf_t __percpu *crash_notes; #define KEXEC_CRASH_MEM_ALIGN PAGE_SIZE #endif -#define KEXEC_CORE_NOTE_NAME CRASH_CORE_NOTE_NAME - /* * This structure is used to hold the arguments that are used when loading * kernel binaries. diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h index 1672801fd98c..37e003ae5262 100644 --- a/include/linux/vmcore_info.h +++ b/include/linux/vmcore_info.h @@ -6,7 +6,6 @@ #include #include -#define CRASH_CORE_NOTE_NAME "CORE" #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) #define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4) #define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) -- cgit v1.2.3 From c6594d64271704b335378e7b74c39fe4d4fcc777 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 6 Feb 2025 19:26:26 +0100 Subject: unroll: add generic loop unroll helpers There are cases when we need to explicitly unroll loops. For example, cache operations, filling DMA descriptors on very high speeds etc. Add compiler-specific attribute macros to give the compiler a hint that we'd like to unroll a loop. Example usage: #define UNROLL_BATCH 8 unrolled_count(UNROLL_BATCH) for (u32 i = 0; i < UNROLL_BATCH; i++) op(priv, i); Note that sometimes the compilers won't unroll loops if they think this would have worse optimization and perf than without unrolling, and that unroll attributes are available only starting GCC 8. For older compiler versions, no hints/attributes will be applied. For better unrolling/parallelization, don't have any variables that interfere between iterations except for the iterator itself. Co-developed-by: Jose E. Marchesi # pragmas Signed-off-by: Jose E. 
Marchesi Reviewed-by: Przemek Kitszel Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20250206182630.3914318-2-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/linux/unroll.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include/linux') diff --git a/include/linux/unroll.h b/include/linux/unroll.h index d42fd6366373..863fb69f6a7e 100644 --- a/include/linux/unroll.h +++ b/include/linux/unroll.h @@ -9,6 +9,50 @@ #include <linux/args.h> +#ifdef CONFIG_CC_IS_CLANG +#define __pick_unrolled(x, y) _Pragma(#x) +#elif CONFIG_GCC_VERSION >= 80000 +#define __pick_unrolled(x, y) _Pragma(#y) +#else +#define __pick_unrolled(x, y) /* not supported */ +#endif + +/** + * unrolled - loop attributes to ask the compiler to unroll it + * + * Usage: + * + * #define BATCH 8 + * + * unrolled_count(BATCH) + * for (u32 i = 0; i < BATCH; i++) + * // loop body without cross-iteration dependencies + * + * This is only a hint and the compiler is free to disable unrolling if it + * thinks the count is suboptimal and may hurt performance and/or hugely + * increase object code size. + * Not having any cross-iteration dependencies (i.e. when iter x + 1 depends + * on what iter x will do with variables) is not a strict requirement, but + * provides best performance and object code size. + * Available only on Clang and GCC 8.x onwards. + */ + +/* Ask the compiler to pick an optimal unroll count, Clang only */ +#define unrolled \ + __pick_unrolled(clang loop unroll(enable), /* nothing */) + +/* Unroll each @n iterations of the loop */ +#define unrolled_count(n) \ + __pick_unrolled(clang loop unroll_count(n), GCC unroll n) + +/* Unroll the whole loop */ +#define unrolled_full \ + __pick_unrolled(clang loop unroll(full), GCC unroll 65534) + +/* Never unroll the loop */ +#define unrolled_none \ + __pick_unrolled(clang loop unroll(disable), GCC unroll 1) + #define UNROLL(N, MACRO, args...) CONCATENATE(__UNROLL_, N)(MACRO, args) #define __UNROLL_0(MACRO, args...) -- cgit v1.2.3 From 848b09d53d923b4caee5491f57a5c5b22d81febc Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Thu, 6 Feb 2025 23:40:33 +0100 Subject: r8152: add vendor/device ID pair for Dell Alienware AW1022z The Dell AW1022z is an RTL8156B based 2.5G Ethernet controller. Add the vendor and product ID values to the driver. This makes Ethernet work with the adapter. Signed-off-by: Aleksander Jan Bajkowski Link: https://patch.msgid.link/20250206224033.980115-1-olek2@wp.pl Signed-off-by: Jakub Kicinski --- include/linux/usb/r8152.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/r8152.h b/include/linux/usb/r8152.h index 33a4c146dc19..2ca60828f28b 100644 --- a/include/linux/usb/r8152.h +++ b/include/linux/usb/r8152.h @@ -30,6 +30,7 @@ #define VENDOR_ID_NVIDIA 0x0955 #define VENDOR_ID_TPLINK 0x2357 #define VENDOR_ID_DLINK 0x2001 +#define VENDOR_ID_DELL 0x413c #define VENDOR_ID_ASUS 0x0b05 #if IS_REACHABLE(CONFIG_USB_RTL8152) -- cgit v1.2.3 From de86c5f60839dc0d771711a848b4f55ad3f90844 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Wed, 5 Feb 2025 11:39:13 +0200 Subject: wifi: mac80211: Add support for EPCS configuration Add support for configuring EPCS state: - When EPCS is enabled, send an EPCS enable request action frame to the AP. When the AP replies with EPCS enable response, enable EPCS by applying the QoS parameters provided by the AP. Do so for all the valid MLD links.
Once EPCS is enabled, support processing of unsolicited EPCS enable response frames. - When EPCS is disabled, send an EPCS teardown request to the AP and apply the QoS parameters as obtained from the last received beacons. Do so for all the valid links. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250205110958.7a90afd7e140.I3f602d65f5c1fd849d6c70b12307dda33aa91ccb@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 16741e542e81..8f35a3a5211c 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1543,6 +1543,10 @@ struct ieee80211_mgmt { u8 count; u8 variable[]; } __packed ml_reconf_resp; + struct { + u8 action_code; + u8 variable[]; + } __packed epcs; } u; } __packed action; DECLARE_FLEX_ARRAY(u8, body); /* Generic frame body */ @@ -5570,6 +5574,9 @@ static inline bool ieee80211_mle_reconf_sta_prof_size_ok(const u8 *data, fixed + prof->sta_info_len - 1 <= len; } +#define IEEE80211_MLE_STA_EPCS_CONTROL_LINK_ID 0x000f +#define IEEE80211_EPCS_ENA_RESP_BODY_LEN 3 + static inline bool ieee80211_tid_to_link_map_size_ok(const u8 *data, size_t len) { const struct ieee80211_ttlm_elem *t2l = (const void *)data; -- cgit v1.2.3 From 282eeec9196fc6593540c7bf7479305a8384de32 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Wed, 5 Feb 2025 11:39:14 +0200 Subject: wifi: ieee80211: Add missing EHT MAC capabilities Add missing EHT MAC capabilities definitions. Signed-off-by: Ilan Peer Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250205110958.6c1643c345a1.I7405b9c35cb39ae97a52c3fbcc36b0bd81e495dc@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 8f35a3a5211c..508d466de1cc 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3113,6 +3113,11 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) #define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_11454 2 #define IEEE80211_EHT_MAC_CAP1_MAX_AMPDU_LEN_MASK 0x01 +#define IEEE80211_EHT_MAC_CAP1_EHT_TRS 0x02 +#define IEEE80211_EHT_MAC_CAP1_TXOP_RET 0x04 +#define IEEE80211_EHT_MAC_CAP1_TWO_BQRS 0x08 +#define IEEE80211_EHT_MAC_CAP1_EHT_LINK_ADAPT_MASK 0x30 +#define IEEE80211_EHT_MAC_CAP1_UNSOL_EPCS_PRIO_ACCESS 0x40 /* EHT PHY capabilities as defined in P802.11be_D2.0 section 9.4.2.313.3 */ #define IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ 0x02 -- cgit v1.2.3 From 1db50b96b059ca8e5548cb3e0e38a888b325f96b Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sun, 9 Feb 2025 15:54:32 +0100 Subject: mtd: rawnand: qcom: finish converting register to FIELD_PREP With some research in an obscure old QSDK, it was possible to find the masks of the last registers that were still set with raw shifts, and convert them to the FIELD_PREP API. This is only a cleanup that modernizes the code a bit and doesn't make any behaviour change.
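The shape of the conversion, using the masks added below (a sketch with made-up variable values; FIELD_PREP() comes from linux/bitfield.h):

	/* before: raw shifts against the bit-position definitions */
	val = (offset << READ_LOCATION_OFFSET) |
	      (size << READ_LOCATION_SIZE) |
	      (1 << READ_LOCATION_LAST);

	/* after: FIELD_PREP() against the new *_MASK definitions */
	val = FIELD_PREP(READ_LOCATION_OFFSET_MASK, offset) |
	      FIELD_PREP(READ_LOCATION_SIZE_MASK, size) |
	      FIELD_PREP(READ_LOCATION_LAST_MASK, 1);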
Signed-off-by: Christian Marangi Signed-off-by: Miquel Raynal --- include/linux/mtd/nand-qpic-common.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand-qpic-common.h b/include/linux/mtd/nand-qpic-common.h index 4d9b736ff8b7..35e7ee0f7809 100644 --- a/include/linux/mtd/nand-qpic-common.h +++ b/include/linux/mtd/nand-qpic-common.h @@ -108,7 +108,7 @@ #define ECC_FORCE_CLK_OPEN BIT(30) /* NAND_DEV_CMD1 bits */ -#define READ_ADDR 0 +#define READ_ADDR_MASK GENMASK(7, 0) /* NAND_DEV_CMD_VLD bits */ #define READ_START_VLD BIT(0) @@ -119,6 +119,7 @@ /* NAND_EBI2_ECC_BUF_CFG bits */ #define NUM_STEPS 0 +#define NUM_STEPS_MASK GENMASK(9, 0) /* NAND_ERASED_CW_DETECT_CFG bits */ #define ERASED_CW_ECC_MASK 1 @@ -139,8 +140,11 @@ /* NAND_READ_LOCATION_n bits */ #define READ_LOCATION_OFFSET 0 +#define READ_LOCATION_OFFSET_MASK GENMASK(9, 0) #define READ_LOCATION_SIZE 16 +#define READ_LOCATION_SIZE_MASK GENMASK(25, 16) #define READ_LOCATION_LAST 31 +#define READ_LOCATION_LAST_MASK BIT(31) /* Version Mask */ #define NAND_VERSION_MAJOR_MASK 0xf0000000 -- cgit v1.2.3 From fcd7ace9a725ae034ff9f24cb94c9fe12a1f02da Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 10 Feb 2025 17:16:14 -0600 Subject: spi: offload: types: include linux/bits.h Add #include <linux/bits.h> to linux/spi/offload/types.h since this file uses the BIT macro. Suggested-by: Andy Shevchenko Signed-off-by: David Lechner Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20250210-spi-offload-extra-headers-v1-1-0f3356362254@baylibre.com Signed-off-by: Mark Brown --- include/linux/spi/offload/types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/spi/offload/types.h b/include/linux/spi/offload/types.h index 86d0e8cb9495..6f7892347871 100644 --- a/include/linux/spi/offload/types.h +++ b/include/linux/spi/offload/types.h @@ -7,6 +7,7 @@ #ifndef __LINUX_SPI_OFFLOAD_TYPES_H #define __LINUX_SPI_OFFLOAD_TYPES_H +#include <linux/bits.h> #include <linux/types.h> struct device; -- cgit v1.2.3 From 6edf3152bd4c2bc58e3705872642e282d8b3eeb9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 10 Feb 2025 21:44:50 +0200 Subject: pwm: lpss: Clarify the bypass member semantics in struct pwm_lpss_boardinfo Instead of an odd comment, cite the documentation, which says more clearly what's going on with the programming flow on some of the Intel SoCs. Signed-off-by: Andy Shevchenko Reviewed-by: Mika Westerberg --- include/linux/platform_data/x86/pwm-lpss.h | 33 +++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/pwm-lpss.h b/include/linux/platform_data/x86/pwm-lpss.h index 752c06b47cc8..f0349edb47f4 100644 --- a/include/linux/platform_data/x86/pwm-lpss.h +++ b/include/linux/platform_data/x86/pwm-lpss.h @@ -15,9 +15,36 @@ struct pwm_lpss_boardinfo { unsigned int npwm; unsigned long base_unit_bits; /* - * Some versions of the IP may stuck in the state machine if enable - * bit is not set, and hence update bit will show busy status till - * the reset. For the rest it may be otherwise. + * NOTE: + * Intel Broxton, Apollo Lake, and Gemini Lake have different programming flow. + * + * Initial Enable or First Activation + * 1. Program the base unit and on time divisor values. + * 2. Set the software update bit. + * 3. Poll in a loop on the PWMCTRL bit until software update bit is cleared.+ + * 4. Enable the PWM output by setting PWM Enable. + * 5.
Repeat the above steps for the next PWM Module. + * + * Dynamic update while PWM is Enabled + * 1. Program the base unit and on-time divisor values. + * 2. Set the software update bit. + * 3. Repeat the above steps for the next PWM module. + * + * + After setting PWMCTRL register's SW update bit, hardware automatically + * deasserts the SW update bit after a brief delay. It was observed that + * setting of PWM enable is typically done via read-modify-write of the PWMCTRL + * register. If there is no/little delay between setting software update bit + * and setting enable bit via read-modify-write, it is possible that the read + * could return with software enable as 1. In that case, the last write to set + * enable to 1 could also set sw_update to 1. If this happens, sw_update gets + * stuck and the driver code can hang as it explicitly waits for sw_update bit + * to be 0 after setting the enable bit to 1. To avoid this race condition, + * SW should poll on the software update bit to make sure that it is 0 before + * doing the read-modify-write to set the enable bit to 1. + * + * Also, we noted that if sw_update bit was set in step #1 above then when it + * is set again in step #2, sw_update bit never gets cleared and the flow hangs. + * As such, we need to make sure that sw_update bit is 0 when doing step #1. */ bool bypass; /* -- cgit v1.2.3 From 36d03cb3277e29beedb87b8efb1e4da02b26e0c0 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Sat, 8 Feb 2025 17:04:15 +0800 Subject: block: introduce init_wait_func() There is already a macro DEFINE_WAIT_FUNC() to declare a wait_queue_entry with a specified waking function. But there is not a counterpart for initializing one wait_queue_entry with a specified waking function. So introduce init_wait_func() for this; it can also be used in iocost and rq-qos. Use default_wake_function() in rq_qos_wait() to wake up waiters, which allows removing the ->task field from rq_qos_wait_data. Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Muchun Song Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250208090416.38642-1-songmuchun@bytedance.com Signed-off-by: Jens Axboe --- include/linux/wait.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 6d90ad974408..2bdc8f47963b 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -1207,14 +1207,16 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) -#define init_wait(wait) \ +#define init_wait_func(wait, function) \ do { \ (wait)->private = current; \ - (wait)->func = autoremove_wake_function; \ + (wait)->func = function; \ INIT_LIST_HEAD(&(wait)->entry); \ (wait)->flags = 0; \ } while (0) +#define init_wait(wait) init_wait_func(wait, autoremove_wake_function) + typedef int (*task_call_f)(struct task_struct *p, void *arg); extern int task_call_func(struct task_struct *p, task_call_f func, void *arg); -- cgit v1.2.3 From 8eb0d381be31bfa01f768ad38a15af7ade805e69 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 10 Feb 2025 21:49:22 +0100 Subject: net: phy: rename eee_broken_modes to eee_disabled_modes This bitmap is used also if the MAC doesn't support an EEE mode. So the mode isn't necessarily broken in the PHY. Therefore rename the bitmap.
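Caller usage is unchanged by the rename; a driver that must keep a mode out of the EEE advertisement still does, e.g. (the mode here is picked arbitrarily):

	phy_set_eee_broken(phydev, ETHTOOL_LINK_MODE_1000baseT_Full_BIT);

only the backing bitmap is now called eee_disabled_modes; the helper itself is renamed in the next patch.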
Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/6cd11422-dd67-4c87-a642-308de694af92@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 19f076a71f94..dbc7e7245881 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -611,7 +611,7 @@ struct macsec_ops; * @eee_cfg: User configuration of EEE * @lp_advertising: Current link partner advertised linkmodes * @host_interfaces: PHY interface modes supported by host - * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited + * @eee_disabled_modes: Energy efficient ethernet modes not to be advertised * @autoneg: Flag autoneg being used * @rate_matching: Current rate matching mode * @link: Current link state @@ -727,7 +727,7 @@ struct phy_device { __ETHTOOL_DECLARE_LINK_MODE_MASK(supported_eee); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising_eee); /* Energy efficient ethernet modes which should be prohibited */ - __ETHTOOL_DECLARE_LINK_MODE_MASK(eee_broken_modes); + __ETHTOOL_DECLARE_LINK_MODE_MASK(eee_disabled_modes); bool enable_tx_lpi; bool eee_active; struct eee_config eee_cfg; @@ -1353,7 +1353,7 @@ int phy_speed_down_core(struct phy_device *phydev); */ static inline void phy_set_eee_broken(struct phy_device *phydev, u32 link_mode) { - linkmode_set_bit(link_mode, phydev->eee_broken_modes); + linkmode_set_bit(link_mode, phydev->eee_disabled_modes); } /** -- cgit v1.2.3 From 5e7a74b6a35782be83b433979e71df2636ab05f0 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 10 Feb 2025 21:50:10 +0100 Subject: net: phy: rename phy_set_eee_broken to phy_disable_eee_mode Consider that an EEE mode may not be broken but simply not supported by the MAC, and rename function phy_set_eee_broken(). Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/30deb630-3f6b-4ffb-a1e6-a9736021f43a@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index dbc7e7245881..29df4c602589 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1347,11 +1347,11 @@ void of_set_phy_timing_role(struct phy_device *phydev); int phy_speed_down_core(struct phy_device *phydev); /** - * phy_set_eee_broken - Mark an EEE mode as broken so that it isn't advertised. + * phy_disable_eee_mode - Don't advertise an EEE mode. * @phydev: The phy_device struct - * @link_mode: The broken EEE mode + * @link_mode: The EEE mode to be disabled */ -static inline void phy_set_eee_broken(struct phy_device *phydev, u32 link_mode) +static inline void phy_disable_eee_mode(struct phy_device *phydev, u32 link_mode) { linkmode_set_bit(link_mode, phydev->eee_disabled_modes); } -- cgit v1.2.3 From 16d11fdaeb22715d8b55b08890173ffa2326baee Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 9 Feb 2025 13:12:44 +0100 Subject: net: phy: remove unused PHY_INIT_TIMEOUT and PHY_FORCE_TIMEOUT Both definitions are unused. 
Last users have been removed with: f3ba9d490d6e ("net: s6gmac: remove driver") 2bd229df5e2e ("net: phy: remove state PHY_FORCING") Signed-off-by: Heiner Kallweit Reviewed-by: Gerhard Engleder Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/f8e7b8ed-a665-41ad-b0ce-cbfdb65262ef@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 29df4c602589..83994b394d8e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -303,9 +303,6 @@ static inline long rgmii_clock(int speed) } } -#define PHY_INIT_TIMEOUT 100000 -#define PHY_FORCE_TIMEOUT 10 - #define PHY_MAX_ADDR 32 /* Used when trying to connect to a specific phy (mii bus id:phy device id) */ -- cgit v1.2.3 From 91931af18bd22437e08e2471f5484d6fbdd8ab93 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Mon, 10 Feb 2025 16:33:27 -0600 Subject: gpiolib: add gpiod_multi_set_value_cansleep() Add a new gpiod_multi_set_value_cansleep() helper function with fewer parameters than gpiod_set_array_value_cansleep(). Calling gpiod_set_array_value_cansleep() can get quite verbose. In many cases, the first arguments all come from the same struct gpio_descs, so having a separate function where we can just pass that cuts down on the boilerplate. Signed-off-by: David Lechner Reviewed-by: Andy Shevchenko Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20250210-gpio-set-array-helper-v3-1-d6a673674da8@baylibre.com Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index db2dfbae8edb..5cbd4afd7862 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -3,6 +3,7 @@ #define __LINUX_GPIO_CONSUMER_H #include <linux/bits.h> +#include <linux/err.h> #include <linux/types.h> struct acpi_device; @@ -655,4 +656,14 @@ static inline void gpiod_unexport(struct gpio_desc *desc) #endif /* CONFIG_GPIOLIB && CONFIG_GPIO_SYSFS */ +static inline int gpiod_multi_set_value_cansleep(struct gpio_descs *descs, + unsigned long *value_bitmap) +{ + if (IS_ERR_OR_NULL(descs)) + return PTR_ERR_OR_ZERO(descs); + + return gpiod_set_array_value_cansleep(descs->ndescs, descs->desc, + descs->info, value_bitmap); +} + #endif -- cgit v1.2.3 From 784ed4354c9077501b3a9236bafea23e5b878703 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Feb 2025 12:27:46 +0100 Subject: uidgid: add map_id_range_up() Add map_id_range_up() to verify that the full kernel id range can be mapped up in a given idmapping. This will be used in follow-up patches.
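The intended semantics, sketched against a hypothetical extent "0:10000:1000" (userns IDs 0..999 backed by kernel IDs 10000..10999); the failure convention is assumed to mirror the existing map_id_up():

	map_id_range_up(map, 10000, 1000);	/* -> 0: the whole extent maps up */
	map_id_range_up(map, 10500, 10);	/* -> 500: the subrange maps up */
	map_id_range_up(map, 10995, 10);	/* fails: the range crosses the end of the extent */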
Link: https://lore.kernel.org/r/20250204-work-mnt_idmap-statmount-v2-1-007720f39f2e@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/uidgid.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index f85ec5613721..2dc767e08f54 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -132,6 +132,7 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) u32 map_id_down(struct uid_gid_map *map, u32 id); u32 map_id_up(struct uid_gid_map *map, u32 id); +u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count); #else @@ -186,6 +187,11 @@ static inline u32 map_id_down(struct uid_gid_map *map, u32 id) return id; } +static inline u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) +{ + return id; +} + static inline u32 map_id_up(struct uid_gid_map *map, u32 id) { return id; -- cgit v1.2.3 From 37c4a9590e1efcae7749682239fc22a330d2d325 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Feb 2025 12:27:47 +0100 Subject: statmount: allow to retrieve idmappings This adds the STATMOUNT_MNT_UIDMAP and STATMOUNT_MNT_GIDMAP options. It allows the retrieval of idmappings via statmount(). Currently it isn't possible to figure out what idmappings are applied to an idmapped mount. This information is often crucial. Before statmount() the only realistic options for an interface like this would have been to add it to /proc/<pid>/fdinfo/<fd> or to expose it in /proc/<pid>/mountinfo. Both solutions would have been pretty ugly and would've shown information that is of strong interest to some applications but not all. statmount() is perfect for this. The idmappings applied to an idmapped mount are shown relative to the caller's user namespace. This is the most useful solution that doesn't risk leaking information or confusing the caller. For example, an idmapped mount might have been created with the following idmappings: mount --bind -o X-mount.idmap="0:10000:1000 2000:2000:1 3000:3000:1" /srv /opt Listing the idmappings through statmount() in the same context shows: mnt_id: 2147485088 mnt_parent_id: 2147484816 fs_type: btrfs mnt_root: /srv mnt_point: /opt mnt_opts: ssd,discard=async,space_cache=v2,subvolid=5,subvol=/ mnt_uidmap[0]: 0 10000 1000 mnt_uidmap[1]: 2000 2000 1 mnt_uidmap[2]: 3000 3000 1 mnt_gidmap[0]: 0 10000 1000 mnt_gidmap[1]: 2000 2000 1 mnt_gidmap[2]: 3000 3000 1 But the idmappings might not always be resolvable in the caller's user namespace. For example: unshare --user --map-root In this case statmount() will skip any mappings that fail to resolve in the caller's idmapping: mnt_id: 2147485087 mnt_parent_id: 2147484016 fs_type: btrfs mnt_root: /srv mnt_point: /opt mnt_opts: ssd,discard=async,space_cache=v2,subvolid=5,subvol=/ The caller can differentiate between a mount not being idmapped and a mount that is idmapped but where all mappings fail to resolve in the caller's idmapping by checking for the STATMOUNT_MNT_{G,U}IDMAP flag being raised but the number of mappings in ->mnt_{g,u}idmap_num being zero. Note that statmount() requires that the whole range must be resolvable in the caller's user namespace. If a subrange fails to map it will still list the map as not resolvable. This is a practical compromise to avoid having to find which subranges are resolvable and which aren't. Idmappings are listed as a string array with each mapping separated by zero bytes.
This allows retrieving the idmappings and immediately using them for writing to e.g. /proc/<pid>/{g,u}id_map, and it also allows for simple iteration like: if (stmnt->mask & STATMOUNT_MNT_UIDMAP) { const char *idmap = stmnt->str + stmnt->mnt_uidmap; for (size_t idx = 0; idx < stmnt->mnt_uidmap_nr; idx++) { printf("mnt_uidmap[%lu]: %s\n", idx, idmap); idmap += strlen(idmap) + 1; } } Link: https://lore.kernel.org/r/20250204-work-mnt_idmap-statmount-v2-2-007720f39f2e@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/mnt_idmapping.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index b1b219bc3422..e71a6070a8f8 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -25,6 +25,11 @@ static_assert(sizeof(vfsgid_t) == sizeof(kgid_t)); static_assert(offsetof(vfsuid_t, val) == offsetof(kuid_t, val)); static_assert(offsetof(vfsgid_t, val) == offsetof(kgid_t, val)); +static inline bool is_valid_mnt_idmap(const struct mnt_idmap *idmap) +{ + return idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap; +} + #ifdef CONFIG_MULTIUSER static inline uid_t __vfsuid_val(vfsuid_t uid) { -- cgit v1.2.3 From c4a16820d90199409c9bf01c4f794e1e9e8d8fd8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 28 Jan 2025 11:33:41 +0100 Subject: fs: add open_tree_attr() Add open_tree_attr(), which allows atomically creating a detached mount tree and setting mount options on it. If OPEN_TREE_CLONE is used this will allow the creation of a detached mount with a new set of mount options without it ever being exposed to userspace without that set of mount options applied. Link: https://lore.kernel.org/r/20250128-work-mnt_idmap-update-v2-v1-3-c25feb0d2eb3@kernel.org Reviewed-by: "Seth Forshee (DigitalOcean)" Signed-off-by: Christian Brauner --- include/linux/syscalls.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c6333204d451..079ea1d09d85 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -951,6 +951,10 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, int flags, uint32_t sig); asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags); +asmlinkage long sys_open_tree_attr(int dfd, const char __user *path, + unsigned flags, + struct mount_attr __user *uattr, + size_t usize); asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path, int to_dfd, const char __user *to_path, unsigned int ms_flags); -- cgit v1.2.3 From 8893516000b247f91fa2cef34f2a77b609e661a4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 7 Feb 2025 17:07:34 +0200 Subject: gpiolib: Deduplicate some code in for_each_requested_gpio_in_range() Refactor for_each_requested_gpio_in_range() to deduplicate some code which basically repeats for_each_hwgpio(). In order to achieve this, split the latter in two, for_each_hwgpio_in_range() and for_each_hwgpio().
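Call sites are unaffected; e.g. dumping the requested lines of a hypothetical 8-line bank at hardware offset 16 still reads:

	for_each_requested_gpio_in_range(gc, i, 16, 8, label)
		dev_dbg(dev, "line %u requested as %s\n", 16 + i, label);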
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250207151149.2119765-2-andriy.shevchenko@linux.intel.com Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 2dd7cb9cc270..314f4241e306 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -550,20 +550,32 @@ DEFINE_CLASS(_gpiochip_for_each_data, const char **label, int *i) /** - * for_each_hwgpio - Iterates over all GPIOs for given chip. + * for_each_hwgpio_in_range - Iterates over all GPIOs in a given range * @_chip: Chip to iterate over. * @_i: Loop counter. + * @_base: First GPIO in the range. + * @_size: Amount of GPIOs to check starting from @base. * @_label: Place to store the address of the label if the GPIO is requested. * Set to NULL for unused GPIOs. */ -#define for_each_hwgpio(_chip, _i, _label) \ - for (CLASS(_gpiochip_for_each_data, _data)(&_label, &_i); \ - *_data.i < _chip->ngpio; \ - (*_data.i)++, kfree(*(_data.label)), *_data.label = NULL) \ - if (IS_ERR(*_data.label = \ - gpiochip_dup_line_label(_chip, *_data.i))) {} \ + else +#define for_each_hwgpio_in_range(_chip, _i, _base, _size, _label) \ + for (CLASS(_gpiochip_for_each_data, _data)(&_label, &_i); \ + *_data.i < _size; \ + (*_data.i)++, kfree(*(_data.label)), *_data.label = NULL) \ + if (IS_ERR(*_data.label = \ + gpiochip_dup_line_label(_chip, _base + *_data.i))) {} \ else +/** + * for_each_hwgpio - Iterates over all GPIOs for given chip. + * @_chip: Chip to iterate over. + * @_i: Loop counter. + * @_label: Place to store the address of the label if the GPIO is requested. + * Set to NULL for unused GPIOs. + */ +#define for_each_hwgpio(_chip, _i, _label) \ + for_each_hwgpio_in_range(_chip, _i, 0, _chip->ngpio, _label) + /** * for_each_requested_gpio_in_range - iterates over requested GPIOs in a given range * @_chip: the chip to query @@ -573,13 +585,8 @@ DEFINE_CLASS(_gpiochip_for_each_data, * @_label: label of current GPIO */ #define for_each_requested_gpio_in_range(_chip, _i, _base, _size, _label) \ - for (CLASS(_gpiochip_for_each_data, _data)(&_label, &_i); \ - *_data.i < _size; \ - (*_data.i)++, kfree(*(_data.label)), *_data.label = NULL) \ - if ((*_data.label = \ - gpiochip_dup_line_label(_chip, _base + *_data.i)) == NULL) {} \ - else if (IS_ERR(*_data.label)) {} \ - else + for_each_hwgpio_in_range(_chip, _i, _base, _size, _label) \ + if (_label == NULL) {} else /* Iterates over all requested GPIO of the given @chip */ #define for_each_requested_gpio(chip, i, label) \ -- cgit v1.2.3 From 767412f092fc6e04147305acd70f15770ece47ec Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 7 Feb 2025 17:07:35 +0200 Subject: gpiolib: Simplify implementation of for_each_hwgpio_in_range() The whole purpose of the custom CLASS() is to have the possibility to initialise the counter variable _i to 0. This can't be done with a simple __free() macro as it would not be allowed by the C language. OTOH, the CLASS() operates with the pointers, and explicit usage of the scoped variable _data is not needed, since the pointers are kept the same over the iterations. Simplify the implementation of for_each_hwgpio_in_range().
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250207151149.2119765-3-andriy.shevchenko@linux.intel.com Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 314f4241e306..89439be2ddaa 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -560,11 +560,9 @@ DEFINE_CLASS(_gpiochip_for_each_data, */ #define for_each_hwgpio_in_range(_chip, _i, _base, _size, _label) \ for (CLASS(_gpiochip_for_each_data, _data)(&_label, &_i); \ - *_data.i < _size; \ - (*_data.i)++, kfree(*(_data.label)), *_data.label = NULL) \ - if (IS_ERR(*_data.label = \ - gpiochip_dup_line_label(_chip, _base + *_data.i))) {} \ - else + _i < _size; \ + _i++, kfree(_label), _label = NULL) \ + if (IS_ERR(_label = gpiochip_dup_line_label(_chip, _base + _i))) {} else /** * for_each_hwgpio - Iterates over all GPIOs for given chip. -- cgit v1.2.3 From d795a052b0ddad3da83dda6ff522c1b1aaa4a525 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Wed, 12 Feb 2025 11:33:12 -0600 Subject: spi: fix missing offload_flags doc Add offload_flags to the documentation comment for struct spi_transfer. This was missed when adding the field. Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20250212154356.784944ea@canb.auug.org.au/ Fixes: 700a281905f2 ("spi: add offload TX/RX streaming APIs") Signed-off-by: David Lechner Link: https://patch.msgid.link/20250212-spi-offload-fixes-v1-1-e192c69e3bb3@baylibre.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 4c087009cf97..8d5c7da39c85 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -973,6 +973,8 @@ struct spi_res { * @rx_sg_mapped: If true, the @rx_sg is mapped for DMA * @tx_sg: Scatterlist for transmit, currently not for client use * @rx_sg: Scatterlist for receive, currently not for client use + * @offload_flags: Flags that are only applicable to specialized SPI offload + * transfers. See %SPI_OFFLOAD_XFER_* in spi-offload.h. * @ptp_sts_word_pre: The word (subject to bits_per_word semantics) offset * within @tx_buf for which the SPI device is requesting that the time * snapshot for this transfer begins. Upon completing the SPI transfer, -- cgit v1.2.3 From 8bf47e4d7b87cbd6a69541643d3fa4003c99d95f Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 10 Feb 2025 09:23:57 +0100 Subject: net: phy: Add support for driver-specific next update time Introduce the `phy_get_next_update_time` function to allow PHY drivers to dynamically determine the time (in jiffies) until the next state update event. This enables more flexible and adaptive polling intervals based on the link state or other conditions. 
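A driver opting in would provide something like the following (a sketch; the driver and its polling policy are invented):

	static unsigned int foo_get_next_update_time(struct phy_device *phydev)
	{
		/* poll quickly while the link is down, relax once it is up */
		return phydev->link ? 5 * HZ : HZ / 10;
	}

	static struct phy_driver foo_phy_driver = {
		/* ... */
		.get_next_update_time = foo_get_next_update_time,
	};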
Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250210082358.200751-2-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 83994b394d8e..64982eba71d1 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1270,6 +1270,19 @@ struct phy_driver { */ int (*led_polarity_set)(struct phy_device *dev, int index, unsigned long modes); + + /** + * @get_next_update_time: Get the time until the next update event + * @dev: PHY device + * + * Callback to determine the time (in jiffies) until the next + * update event for the PHY state machine. Allows PHY drivers to + * dynamically adjust polling intervals based on link state or other + * conditions. + * + * Returns the time in jiffies until the next update event. + */ + unsigned int (*get_next_update_time)(struct phy_device *dev); }; #define to_phy_driver(d) container_of_const(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) -- cgit v1.2.3 From 2001d21592e5eb531d23950223eedad55c987db8 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 10 Feb 2025 10:36:44 +0000 Subject: net: phylink: provide phylink_mac_implements_lpi() Provide a helper to determine whether the MAC operations structure implements the LPI operations, which will be used by both phylink and DSA. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1thR9g-003vX6-4s@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 898b00451bbf..0de78673172d 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -737,6 +737,18 @@ static inline int phylink_get_link_timer_ns(phy_interface_t interface) } } +/** + * phylink_mac_implements_lpi() - determine if MAC implements LPI ops + * @ops: phylink_mac_ops structure + * + * Returns true if the phylink MAC operations structure indicates that the + * LPI operations have been implemented, false otherwise. + */ +static inline bool phylink_mac_implements_lpi(const struct phylink_mac_ops *ops) +{ + return ops && ops->mac_disable_tx_lpi && ops->mac_enable_tx_lpi; +} + void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state, unsigned int neg_mode, u16 bmsr, u16 lpa); void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs, -- cgit v1.2.3 From a399af4e3b1ab2c5d83292d4487c4d18de551659 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Jan 2025 15:09:26 +0100 Subject: jbd2: Avoid long replay times due to high number of revoke blocks Some users are reporting journal replay takes a long time when there is an excessive number of revoke blocks in the journal. Reported times are like: 1048576 records - 95 seconds 2097152 records - 580 seconds The problem is that hash chains in the revoke table get excessively long in these cases. Fix the problem by sizing the revoke table appropriately before the revoke pass. Thanks to Alexey Zhuravlev for benchmarking the patch with large numbers of revoke blocks [1].
[1] https://lore.kernel.org/all/20250113183107.7bfef7b6@x390.bzzz77.ru Signed-off-by: Jan Kara Reviewed-by: Andreas Dilger Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20250121140925.17231-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 053444f7a1d0..82ef232935c0 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1621,6 +1621,8 @@ extern void jbd2_journal_destroy_revoke_record_cache(void); extern void jbd2_journal_destroy_revoke_table_cache(void); extern int __init jbd2_journal_init_revoke_record_cache(void); extern int __init jbd2_journal_init_revoke_table_cache(void); +struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size); +void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table); extern void jbd2_journal_destroy_revoke(journal_t *); extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); -- cgit v1.2.3 From cd3fa304ba5c93ce57b9b55b3cd893af2be96527 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 5 Feb 2025 14:15:52 +0800 Subject: pmdomain: core: Introduce dev_pm_genpd_rpm_always_on() For some usecases a consumer driver requires its device to remain powered on, from the PM domain perspective, during runtime. Using dev PM qos along with the genpd governors doesn't work for this case, as it would potentially prevent the device from being runtime suspended too. To support these usecases, let's introduce dev_pm_genpd_rpm_always_on() to allow consumer drivers to dynamically control the behaviour in genpd for a device that is attached to it. Signed-off-by: Ulf Hansson Signed-off-by: Shawn Lin Acked-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/1738736156-119203-4-git-send-email-shawn.lin@rock-chips.com Signed-off-by: Ulf Hansson --- include/linux/pm_domain.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 1aab31370065..d56a78af4af1 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -261,6 +261,7 @@ struct generic_pm_domain_data { unsigned int rpm_pstate; unsigned int opp_token; bool hw_mode; + bool rpm_always_on; void *data; }; @@ -293,6 +294,7 @@ ktime_t dev_pm_genpd_get_next_hrtimer(struct device *dev); void dev_pm_genpd_synced_poweroff(struct device *dev); int dev_pm_genpd_set_hwmode(struct device *dev, bool enable); bool dev_pm_genpd_get_hwmode(struct device *dev); +int dev_pm_genpd_rpm_always_on(struct device *dev, bool on); extern struct dev_power_governor simple_qos_governor; extern struct dev_power_governor pm_domain_always_on_gov; @@ -376,6 +378,11 @@ static inline bool dev_pm_genpd_get_hwmode(struct device *dev) return false; } +static inline int dev_pm_genpd_rpm_always_on(struct device *dev, bool on) +{ + return -EOPNOTSUPP; +} + #define simple_qos_governor (*(struct dev_power_governor *)(NULL)) #define pm_domain_always_on_gov (*(struct dev_power_governor *)(NULL)) #endif -- cgit v1.2.3 From fbc54ae4f8d7cef3b11f70945bf8df8f952814b6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 13 Feb 2025 13:45:05 +0200 Subject: i2c: Unexport i2c_of_match_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit i2c_of_match_device() is not used anymore outside of the I²C framework; unexport it.
Signed-off-by: Andy Shevchenko Reviewed-by: Vladimir Zapolskiy Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 2b2af24d2a43..997e80649889 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -1029,10 +1029,6 @@ static inline struct i2c_adapter *of_get_i2c_adapter_by_node(struct device_node return i2c_get_adapter_by_fwnode(of_fwnode_handle(node)); } -const struct of_device_id -*i2c_of_match_device(const struct of_device_id *matches, - struct i2c_client *client); - int of_i2c_get_board_info(struct device *dev, struct device_node *node, struct i2c_board_info *info); @@ -1053,13 +1049,6 @@ static inline struct i2c_adapter *of_get_i2c_adapter_by_node(struct device_node return NULL; } -static inline const struct of_device_id -*i2c_of_match_device(const struct of_device_id *matches, - struct i2c_client *client) -{ - return NULL; -} - static inline int of_i2c_get_board_info(struct device *dev, struct device_node *node, struct i2c_board_info *info) -- cgit v1.2.3 From 2c0ae8ef1e5edfd0e42727fba4617694f3aac2eb Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Fri, 7 Feb 2025 12:28:38 +0530 Subject: soundwire: amd: add support for ACP7.0 & ACP7.1 platforms Add SoundWire support for ACP7.0 and ACP7.1 platforms. Signed-off-by: Vijendar Mukunda Link: https://lore.kernel.org/r/20250207065841.4718-4-Vijendar.Mukunda@amd.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_amd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h index 799f8578137b..6b839987f14c 100644 --- a/include/linux/soundwire/sdw_amd.h +++ b/include/linux/soundwire/sdw_amd.h @@ -28,6 +28,8 @@ #define ACP_SDW1 1 #define AMD_SDW_MAX_MANAGER_COUNT 2 #define ACP63_PCI_REV_ID 0x63 +#define ACP70_PCI_REV_ID 0x70 +#define ACP71_PCI_REV_ID 0x71 struct acp_sdw_pdata { u16 instance; -- cgit v1.2.3 From 34dba73b231f2a46af88519d573052cc57a84952 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 11 Feb 2025 11:20:56 +0100 Subject: sctp: Remove commented out code Remove commented out code. Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20250211102057.587182-1-thorsten.blum@linux.dev Signed-off-by: Jakub Kicinski --- include/linux/sctp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 836a7e200f39..812011d8b67e 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -222,7 +222,6 @@ struct sctp_datahdr { __be16 stream; __be16 ssn; __u32 ppid; - /* __u8 payload[]; */ }; struct sctp_data_chunk { -- cgit v1.2.3 From 9e28059d56649a7212d5b3f8751ec021154ba3dd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 7 Feb 2025 23:08:02 -0500 Subject: ext4: introduce linear search for dentries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch addresses an issue where some files in case-insensitive directories become inaccessible due to changes introduced by commit 5c26d2f1d3f5 ("unicode: Don't special case ignorable code points") in how the kernel function utf8_casefold() generates case-folded strings. There are good reasons why this change should be made; it's actually quite stupid that Unicode seems to think that the characters ❤ and ❤️ should be casefolded.
Unfortunately, because of the backwards compatibility issue, this commit was reverted in 231825b2e1ff. This problem is addressed by instituting a brute-force linear fallback if a lookup fails on a case-folded directory, which does result in a performance hit when looking up files affected by the change in how the kernel treats ignorable Unicode characters, or when attempting to look up non-existent file names. This fallback can therefore be disabled by setting an encoding flag if, in the future, the system administrator or the manufacturer of a mobile handset or tablet can be sure that there was no opportunity for a kernel to insert file names with incompatible encodings. Fixes: 5c26d2f1d3f5 ("unicode: Don't special case ignorable code points") Signed-off-by: Theodore Ts'o Reviewed-by: Gabriel Krisman Bertazi --- include/linux/fs.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2c3b2f8a621f..aa4ec39202c3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1258,11 +1258,19 @@ extern int send_sigurg(struct file *file); #define SB_NOUSER BIT(31) /* These flags relate to encoding and casefolding */ -#define SB_ENC_STRICT_MODE_FL (1 << 0) +#define SB_ENC_STRICT_MODE_FL (1 << 0) +#define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1) #define sb_has_strict_encoding(sb) \ (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) +#if IS_ENABLED(CONFIG_UNICODE) +#define sb_no_casefold_compat_fallback(sb) \ + (sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL) +#else +#define sb_no_casefold_compat_fallback(sb) (1) +#endif + /* * Umount options */ -- cgit v1.2.3 From 5e4304ff8cd9330690de73df7d047014dce191bd Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz Date: Fri, 17 Jan 2025 15:33:07 -0500 Subject: drivers/hv: introduce vmbus_channel_set_cpu() The core functionality in target_cpu_store() is also needed in a subsequent patch for automatically changing the CPU when taking a CPU offline. As such, factor out the body of target_cpu_store() into a new function, vmbus_channel_set_cpu(), that can also be used elsewhere. No functional change is intended. Cc: Boqun Feng Cc: Michael Kelley Cc: Wei Liu Signed-off-by: Hamza Mahfooz Reviewed-by: Michael Kelley Tested-by: Michael Kelley Link: https://lore.kernel.org/r/20250117203309.192072-2-hamzamahfooz@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20250117203309.192072-2-hamzamahfooz@linux.microsoft.com> --- include/linux/hyperv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 4179add2864b..7f4f8d8bdf43 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1661,6 +1661,7 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, const guid_t *shv_host_servie_id); int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp); void vmbus_set_event(struct vmbus_channel *channel); +int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu); /* Get the start of the ring buffer.
*/ static inline void * -- cgit v1.2.3 From 56467292794b800164df20c076c409ac548e56ec Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 30 Jan 2025 13:35:48 -0800 Subject: bpf: fs/xattr: Add BPF kfuncs to set and remove xattrs Add the following kfuncs to set and remove xattrs from BPF programs: bpf_set_dentry_xattr bpf_remove_dentry_xattr bpf_set_dentry_xattr_locked bpf_remove_dentry_xattr_locked The _locked versions of these kfuncs are called from hooks where dentry->d_inode is already locked. Instead of requiring the user to know which version of the kfuncs to use, the verifier will pick the proper kfunc based on the calling hook. Signed-off-by: Song Liu Acked-by: Christian Brauner Reviewed-by: Matt Bobrowski Link: https://lore.kernel.org/r/20250130213549.3353349-5-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_lsm.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index aefcd6564251..643809cc78c3 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -48,6 +48,11 @@ void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func) int bpf_lsm_get_retval_range(const struct bpf_prog *prog, struct bpf_retval_range *range); +int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str, + const struct bpf_dynptr *value_p, int flags); +int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str); +bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog); + #else /* !CONFIG_BPF_LSM */ static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) @@ -86,6 +91,19 @@ static inline int bpf_lsm_get_retval_range(const struct bpf_prog *prog, { return -EOPNOTSUPP; } +static inline int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str, + const struct bpf_dynptr *value_p, int flags) +{ + return -EOPNOTSUPP; +} +static inline int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str) +{ + return -EOPNOTSUPP; +} +static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog) +{ + return false; +} #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ -- cgit v1.2.3 From dcba69711fff8af4370cb4c00460db0bf47c7093 Mon Sep 17 00:00:00 2001 From: Jameson Thies Date: Tue, 4 Feb 2025 02:45:58 +0000 Subject: platform/chrome: add PD_EVENT_INIT bit definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update cros_ec_commands.h to include a definition for PD_EVENT_INIT. On platforms supporting UCSI, this host event type is sent when the PPM initializes.
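As a quick illustration (not part of the patch; the handler and the way the status word is obtained are assumptions), a cros_ec host-event consumer could test the new bit like this:

#include <linux/platform_data/cros_ec_commands.h>

/* Hypothetical sketch: react to the PPM (re)initializing. */
static void example_handle_pd_host_events(const struct ec_response_host_event_status *resp)
{
	if (resp->status & PD_EVENT_INIT) {
		/* The PPM has just initialized: resynchronize UCSI state here. */
	}
}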
Signed-off-by: Jameson Thies Reviewed-by: Benson Leung Acked-by: Tzung-Bi Shih Reviewed-by: Łukasz Bartosik Link: https://lore.kernel.org/r/20250204024600.4138776-2-jthies@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_data/cros_ec_commands.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index ecf290a0c98f..1f4e4f2b89bb 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -5046,6 +5046,7 @@ struct ec_response_pd_status { #define PD_EVENT_DATA_SWAP BIT(3) #define PD_EVENT_TYPEC BIT(4) #define PD_EVENT_PPM BIT(5) +#define PD_EVENT_INIT BIT(6) struct ec_response_host_event_status { uint32_t status; /* PD MCU host event status */ -- cgit v1.2.3 From c08b0f2c317223fa702616dfa77f10a92b7b34b2 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 29 Jan 2025 08:25:50 -0800 Subject: Revert "tty/serial: Add kgdb_nmi driver" This reverts commit 0c57dfcc6c1d037243c2f8fbf62eab3633326ec0. The functionality was supposed to be used by a later patch in the series that never landed [1]. Drop it. NOTE: part of the functionality was already reverted by commit 39d0be87438a ("serial: kgdb_nmi: Remove unused knock code"). Also note that this revert is not a clean revert given code changes that have happened in the meantime. It's obvious that nobody is using this code since the two exposed functions (kgdb_register_nmi_console() and kgdb_unregister_nmi_console()) are both no-ops if "arch_kgdb_ops.enable_nmi" is not defined. No architectures define it. [1] https://lore.kernel.org/lkml/1348522080-32629-9-git-send-email-anton.vorontsov@linaro.org/ Signed-off-by: Douglas Anderson Acked-by: Andy Shevchenko Link: URL [1] Link: https://lore.kernel.org/r/20250129082535.1.Ia095eac1ae357f87d23e7af2206741f5d40788f1@changeid Signed-off-by: Greg Kroah-Hartman --- include/linux/kgdb.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 51ef131e66b7..14739952698b 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -306,14 +306,6 @@ extern const struct kgdb_arch arch_kgdb_ops; extern unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs); -#ifdef CONFIG_SERIAL_KGDB_NMI -extern int kgdb_register_nmi_console(void); -extern int kgdb_unregister_nmi_console(void); -#else -static inline int kgdb_register_nmi_console(void) { return 0; } -static inline int kgdb_unregister_nmi_console(void) { return 0; } -#endif - extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops); extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops); extern struct kgdb_io *dbg_io_ops; -- cgit v1.2.3 From a029a219385cfdf9c43fb1ef9eb49d173773388f Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 29 Jan 2025 08:25:52 -0800 Subject: Revert "kernel/debug: Mask KGDB NMI upon entry" This reverts commit 5a14fead07bcf4e0acc877a8d9e1d1f40a441153. No architectures ever implemented `enable_nmi` since the later patches in the series adding it never landed. It's been a long time. Drop it. NOTE: this is not a clean revert due to changes in the file in the meantime.
Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20250129082535.3.I2254953cd852f31f354456689d68b2d910de3fbe@changeid Signed-off-by: Greg Kroah-Hartman --- include/linux/kgdb.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 14739952698b..5eebbe7a3545 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -257,7 +257,6 @@ extern void kgdb_arch_late(void); * hardware breakpoints. * @correct_hw_break: Allow an architecture to specify how to correct the * hardware debug registers. - * @enable_nmi: Manage NMI-triggered entry to KGDB */ struct kgdb_arch { unsigned char gdb_bpt_instr[BREAK_INSTR_SIZE]; @@ -270,8 +269,6 @@ struct kgdb_arch { void (*disable_hw_break)(struct pt_regs *regs); void (*remove_all_hw_break)(void); void (*correct_hw_break)(void); - - void (*enable_nmi)(bool on); }; /** -- cgit v1.2.3 From 582077c94052bd69a544b3f9d7619c9c6a67c34b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Feb 2025 13:15:33 +0100 Subject: x86/cfi: Clean up linkage With the introduction of kCFI, the addition of ENDBR to SYM_FUNC_START* no longer suffices to make the function indirectly callable. This now requires the use of SYM_TYPED_FUNC_START. As such, remove the implicit ENDBR from SYM_FUNC_START* and add some explicit annotations to fix things up again. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Link: https://lore.kernel.org/r/20250207122546.409116003@infradead.org --- include/linux/compiler.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 200fd3c5bc70..aa7f0a9f6085 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -212,6 +212,16 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* __KERNEL__ */ +#if defined(CONFIG_CFI_CLANG) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +/* + * Force a reference to the external symbol so the compiler generates + * __kcfi_typid. + */ +#define KCFI_REFERENCE(sym) __ADDRESSABLE(sym) +#else +#define KCFI_REFERENCE(sym) +#endif + /** * offset_to_ptr - convert a relative memory offset to an absolute pointer * @off: the address of the 32-bit offset value -- cgit v1.2.3 From 93f16a1ab78ca56e3cd997d1ea54c214774781ac Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Feb 2025 13:15:34 +0100 Subject: x86/boot: Mark start_secondary() with __noendbr The handoff between the boot stubs and start_secondary() happens before IBT is enabled and is definitely not subject to kCFI. As such, suppress all that for this function. Notably, when the ENDBR poison becomes fatal (ud1 instead of nop), this will trigger a triple fault because we haven't set up the IDT to handle #UD yet.
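For illustration, a minimal sketch of the annotation pattern this series enables; the function below is hypothetical, and __noendbr plus the ANNOTATE_NOENDBR_SYM helper (added in the objtool.h hunk that follows) are assumed to be available on x86:

/* Sketch: a function reached before IBT is enabled must not carry an
 * ENDBR and is exempted from kCFI-style indirect-call checking. */
static void __noendbr example_secondary_entry(void *unused)
{
	/* early CPU bring-up work would happen here */
}
ANNOTATE_NOENDBR_SYM(example_secondary_entry);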
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Link: https://lore.kernel.org/r/20250207122546.509520369@infradead.org --- include/linux/objtool.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index c722a921165b..3ca965a2ddc8 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -128,7 +128,7 @@ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t" #define STACK_FRAME_NON_STANDARD(func) #define STACK_FRAME_NON_STANDARD_FP(func) -#define __ASM_ANNOTATE(label, type) +#define __ASM_ANNOTATE(label, type) "" #define ASM_ANNOTATE(type) #else .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 @@ -147,6 +147,8 @@ * these relocations will never be used for indirect calls. */ #define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) +#define ANNOTATE_NOENDBR_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOENDBR)) + /* * This should be used immediately before an indirect jump/call. It tells * objtool the subsequent indirect jump/call is vouched safe for retpoline -- cgit v1.2.3 From aa34b811650ce87a82d1bff027eeb15e2856f7bb Mon Sep 17 00:00:00 2001 From: James Houghton Date: Tue, 4 Feb 2025 00:40:29 +0000 Subject: KVM: Allow lockless walk of SPTEs when handling aging mmu_notifier event It is possible to correctly do aging without taking the KVM MMU lock, or while taking it for read; add a Kconfig to let architectures do so. Architectures that select KVM_MMU_LOCKLESS_AGING are responsible for correctness. Suggested-by: Yu Zhao Signed-off-by: James Houghton Reviewed-by: David Matlack Link: https://lore.kernel.org/r/20250204004038.1680123-3-jthoughton@google.com [sean: massage shortlog+changelog, fix Kconfig goof and shorten name] Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f34f4cfaa513..c28a6aa1f2ed 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -267,6 +267,7 @@ struct kvm_gfn_range { union kvm_mmu_notifier_arg arg; enum kvm_gfn_range_filter attr_filter; bool may_block; + bool lockless; }; bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); -- cgit v1.2.3 From 27ebd8bf9e4b53388cef2d9cdb2947bc456b0b33 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 6 Nov 2024 12:37:18 -0500 Subject: virtchnl: add support for enabling PTP on iAVF Add support for allowing a VF to enable the PTP feature: Rx timestamps. The new capability is gated by VIRTCHNL_VF_CAP_PTP, which must be set by the VF to request access to the new operations. In addition, the VIRTCHNL_OP_1588_PTP_CAPS command is used to determine the specific capabilities available to the VF. This support includes the following additional capabilities: * Rx timestamps enabled in the Rx queues (when using flexible advanced descriptors) * Read access to PHC time over virtchnl using VIRTCHNL_OP_1588_PTP_GET_TIME Extra space is reserved in most structures to allow for future extension (like set clock, Tx timestamps). Additional opcode numbers are reserved, and space in the virtchnl_ptp_caps structure is specifically set aside for this; each structure also has some space reserved to allow flexibility for future extensions.
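To make the negotiation flow concrete, here is a hedged sketch of the VF side; example_send_to_pf() is a made-up stand-in for the driver's mailbox transport, not a real virtchnl API:

#include <linux/avf/virtchnl.h>

/* Hypothetical transport helper. */
int example_send_to_pf(enum virtchnl_ops op, u8 *msg, u16 len);

/* Sketch: request Rx timestamping and PHC read access from the PF. */
static int example_negotiate_ptp_caps(void)
{
	struct virtchnl_ptp_caps caps = {
		.caps = VIRTCHNL_1588_PTP_CAP_RX_TSTAMP |
			VIRTCHNL_1588_PTP_CAP_READ_PHC,
	};

	/* The PF replies with the same structure, with .caps reduced to
	 * the subset actually enabled for this VF. */
	return example_send_to_pf(VIRTCHNL_OP_1588_PTP_GET_CAPS,
				  (u8 *)&caps, sizeof(caps));
}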
Signed-off-by: Jacob Keller Reviewed-by: Rahul Rameshbabu Reviewed-by: Simon Horman Reviewed-by: Alexander Lobakin Tested-by: Rafal Romanowski Signed-off-by: Mateusz Polchlopek Signed-off-by: Tony Nguyen --- include/linux/avf/virtchnl.h | 67 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 13a11f3c09b8..92866e449b21 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -154,7 +154,10 @@ enum virtchnl_ops { VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 = 55, VIRTCHNL_OP_ENABLE_VLAN_INSERTION_V2 = 56, VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2 = 57, - /* opcode 57 - 65 are reserved */ + /* opcode 58 and 59 are reserved */ + VIRTCHNL_OP_1588_PTP_GET_CAPS = 60, + VIRTCHNL_OP_1588_PTP_GET_TIME = 61, + /* opcode 62 - 65 are reserved */ VIRTCHNL_OP_GET_QOS_CAPS = 66, /* opcode 68 through 111 are reserved */ VIRTCHNL_OP_CONFIG_QUEUE_BW = 112, @@ -270,6 +273,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); #define VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF BIT(27) #define VIRTCHNL_VF_OFFLOAD_FDIR_PF BIT(28) #define VIRTCHNL_VF_OFFLOAD_QOS BIT(29) +#define VIRTCHNL_VF_CAP_PTP BIT(31) #define VF_BASE_MODE_OFFLOADS (VIRTCHNL_VF_OFFLOAD_L2 | \ VIRTCHNL_VF_OFFLOAD_VLAN | \ @@ -1425,6 +1429,61 @@ struct virtchnl_fdir_del { VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_fdir_del); +#define VIRTCHNL_1588_PTP_CAP_RX_TSTAMP BIT(1) +#define VIRTCHNL_1588_PTP_CAP_READ_PHC BIT(2) + +/** + * struct virtchnl_ptp_caps - Defines the PTP caps available to the VF. + * @caps: On send, VF sets what capabilities it requests. On reply, PF + * indicates what has been enabled for this VF. The PF shall not set + * bits which were not requested by the VF. + * @rsvd: Reserved bits for future extension. + * + * Structure that defines the PTP capabilities available to the VF. The VF + * sends VIRTCHNL_OP_1588_PTP_GET_CAPS, and must fill in the ptp_caps field + * indicating what capabilities it is requesting. The PF will respond with the + * same message with the virtchnl_ptp_caps structure indicating what is + * enabled for the VF. + * + * VIRTCHNL_1588_PTP_CAP_RX_TSTAMP indicates that the VF receive queues have + * receive timestamps enabled in the flexible descriptors. Note that this + * requires a VF to also negotiate to enable advanced flexible descriptors in + * the receive path instead of the default legacy descriptor format. + * + * VIRTCHNL_1588_PTP_CAP_READ_PHC indicates that the VF may read the PHC time + * via the VIRTCHNL_OP_1588_PTP_GET_TIME command. + * + * Note that in the future, additional capability flags may be added which + * indicate additional extended support. All fields marked as reserved by this + * header will be set to zero. VF implementations should verify this to ensure + * that future extensions do not break compatibility. + */ +struct virtchnl_ptp_caps { + u32 caps; + u8 rsvd[44]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(48, virtchnl_ptp_caps); + +/** + * struct virtchnl_phc_time - Contains the 64bits of PHC clock time in ns. + * @time: PHC time in nanoseconds + * @rsvd: Reserved for future extension + * + * Structure received with VIRTCHNL_OP_1588_PTP_GET_TIME. Contains the 64bits + * of PHC clock time in nanoseconds. + * + * VIRTCHNL_OP_1588_PTP_GET_TIME may be sent to request the current time of + * the PHC. This op is available in case direct access via the PHC registers + * is not available. 
+ */ +struct virtchnl_phc_time { + u64 time; + u8 rsvd[8]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_phc_time); struct virtchnl_shaper_bw { /* Unit is Kbps */ u32 committed; @@ -1757,6 +1816,12 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, } } break; + case VIRTCHNL_OP_1588_PTP_GET_CAPS: + valid_len = sizeof(struct virtchnl_ptp_caps); + break; + case VIRTCHNL_OP_1588_PTP_GET_TIME: + valid_len = sizeof(struct virtchnl_phc_time); + break; /* These are always errors coming from the VF. */ case VIRTCHNL_OP_EVENT: case VIRTCHNL_OP_UNKNOWN: -- cgit v1.2.3 From 7c1178a9df583454fc76be2ca8a6f0bef6613fba Mon Sep 17 00:00:00 2001 From: Simei Su Date: Wed, 6 Nov 2024 12:37:19 -0500 Subject: ice: support Rx timestamp on flex descriptor To support Rx timestamp offload, VIRTCHNL_OP_1588_PTP_CAPS is sent by the VF to request PTP capabilities, and the PF responds with which capabilities are enabled for that VF. Hardware captures timestamps which contain only 32 bits of nominal nanoseconds, as opposed to the 64-bit timestamps that the stack expects. To convert 32b to 64b, we need a current PHC time. VIRTCHNL_OP_1588_PTP_GET_TIME is sent by the VF, and the PF responds with the current PHC time. Signed-off-by: Simei Su Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Co-developed-by: Mateusz Polchlopek Signed-off-by: Mateusz Polchlopek Signed-off-by: Tony Nguyen --- include/linux/avf/virtchnl.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 92866e449b21..56baf97c44d0 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -313,6 +313,18 @@ struct virtchnl_txq_info { VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_txq_info); +/* virtchnl_rxq_info_flags - definition of bits in the flags field of the + * virtchnl_rxq_info structure. + * + * @VIRTCHNL_PTP_RX_TSTAMP: request to enable Rx timestamping + * + * Other flag bits are currently reserved and they may be extended in the + * future. + */ +enum virtchnl_rxq_info_flags { + VIRTCHNL_PTP_RX_TSTAMP = BIT(0), +}; + /* VIRTCHNL_OP_CONFIG_RX_QUEUE * VF sends this message to set up parameters for one RX queue. * External data buffer contains one instance of virtchnl_rxq_info. @@ -336,7 +348,8 @@ struct virtchnl_rxq_info { u32 max_pkt_size; u8 crc_disable; u8 rxdid; - u8 pad1[2]; + enum virtchnl_rxq_info_flags flags:8; /* see virtchnl_rxq_info_flags */ + u8 pad1; u64 dma_ring_addr; /* see enum virtchnl_rx_hsplit; deprecated with AVF 1.0 */ -- cgit v1.2.3 From 6a88c797ab4005cd5dd02575c1b3d1e7b53fe715 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 6 Nov 2024 12:37:20 -0500 Subject: virtchnl: add enumeration for the rxdid format Support for allowing the VF to negotiate the descriptor format requires that the VF specify which descriptor format to use when requesting Rx queues. The VF is supposed to request the set of supported formats via the new VIRTCHNL_OP_GET_SUPPORTED_RXDIDS, and then set one of the supported formats in the rxdid field of the virtchnl_rxq_info structure. The virtchnl.h header does not provide an enumeration of the format values. The existing implementations in the PF directly use the values from the DDP package. Make the formats explicit by defining an enumeration of the RXDIDs. Provide an enumeration for the values as well as the bit positions as returned by the supported_rxdids data from the VIRTCHNL_OP_GET_SUPPORTED_RXDIDS.
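A hedged sketch of how a VF might consume the new enumerations; the helper below is illustrative, not from the patch:

#include <linux/avf/virtchnl.h>

/* Sketch: pick a descriptor format from the PF's supported_rxdids
 * bitmap, falling back to the always-available 32-byte base format. */
static u8 example_pick_rxdid(u64 supported_rxdids)
{
	if (supported_rxdids & VIRTCHNL_RXDID_2_FLEX_SQ_NIC_M)
		return VIRTCHNL_RXDID_2_FLEX_SQ_NIC;

	return VIRTCHNL_RXDID_1_32B_BASE;
}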
Signed-off-by: Jacob Keller Reviewed-by: Rahul Rameshbabu Reviewed-by: Simon Horman Reviewed-by: Alexander Lobakin Tested-by: Rafal Romanowski Signed-off-by: Mateusz Polchlopek Signed-off-by: Tony Nguyen --- include/linux/avf/virtchnl.h | 50 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 56baf97c44d0..bc10e6ffa50b 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -313,6 +313,48 @@ struct virtchnl_txq_info { VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_txq_info); +/* RX descriptor IDs (range from 0 to 63) */ +enum virtchnl_rx_desc_ids { + VIRTCHNL_RXDID_0_16B_BASE = 0, + VIRTCHNL_RXDID_1_32B_BASE = 1, + VIRTCHNL_RXDID_2_FLEX_SQ_NIC = 2, + VIRTCHNL_RXDID_3_FLEX_SQ_SW = 3, + VIRTCHNL_RXDID_4_FLEX_SQ_NIC_VEB = 4, + VIRTCHNL_RXDID_5_FLEX_SQ_NIC_ACL = 5, + VIRTCHNL_RXDID_6_FLEX_SQ_NIC_2 = 6, + VIRTCHNL_RXDID_7_HW_RSVD = 7, + /* 8 through 15 are reserved */ + VIRTCHNL_RXDID_16_COMMS_GENERIC = 16, + VIRTCHNL_RXDID_17_COMMS_AUX_VLAN = 17, + VIRTCHNL_RXDID_18_COMMS_AUX_IPV4 = 18, + VIRTCHNL_RXDID_19_COMMS_AUX_IPV6 = 19, + VIRTCHNL_RXDID_20_COMMS_AUX_FLOW = 20, + VIRTCHNL_RXDID_21_COMMS_AUX_TCP = 21, + /* 22 through 63 are reserved */ +}; + +#define VIRTCHNL_RXDID_BIT(x) BIT_ULL(VIRTCHNL_RXDID_##x) + +/* RX descriptor ID bitmasks */ +enum virtchnl_rx_desc_id_bitmasks { + VIRTCHNL_RXDID_0_16B_BASE_M = VIRTCHNL_RXDID_BIT(0_16B_BASE), + VIRTCHNL_RXDID_1_32B_BASE_M = VIRTCHNL_RXDID_BIT(1_32B_BASE), + VIRTCHNL_RXDID_2_FLEX_SQ_NIC_M = VIRTCHNL_RXDID_BIT(2_FLEX_SQ_NIC), + VIRTCHNL_RXDID_3_FLEX_SQ_SW_M = VIRTCHNL_RXDID_BIT(3_FLEX_SQ_SW), + VIRTCHNL_RXDID_4_FLEX_SQ_NIC_VEB_M = VIRTCHNL_RXDID_BIT(4_FLEX_SQ_NIC_VEB), + VIRTCHNL_RXDID_5_FLEX_SQ_NIC_ACL_M = VIRTCHNL_RXDID_BIT(5_FLEX_SQ_NIC_ACL), + VIRTCHNL_RXDID_6_FLEX_SQ_NIC_2_M = VIRTCHNL_RXDID_BIT(6_FLEX_SQ_NIC_2), + VIRTCHNL_RXDID_7_HW_RSVD_M = VIRTCHNL_RXDID_BIT(7_HW_RSVD), + /* 8 through 15 are reserved */ + VIRTCHNL_RXDID_16_COMMS_GENERIC_M = VIRTCHNL_RXDID_BIT(16_COMMS_GENERIC), + VIRTCHNL_RXDID_17_COMMS_AUX_VLAN_M = VIRTCHNL_RXDID_BIT(17_COMMS_AUX_VLAN), + VIRTCHNL_RXDID_18_COMMS_AUX_IPV4_M = VIRTCHNL_RXDID_BIT(18_COMMS_AUX_IPV4), + VIRTCHNL_RXDID_19_COMMS_AUX_IPV6_M = VIRTCHNL_RXDID_BIT(19_COMMS_AUX_IPV6), + VIRTCHNL_RXDID_20_COMMS_AUX_FLOW_M = VIRTCHNL_RXDID_BIT(20_COMMS_AUX_FLOW), + VIRTCHNL_RXDID_21_COMMS_AUX_TCP_M = VIRTCHNL_RXDID_BIT(21_COMMS_AUX_TCP), + /* 22 through 63 are reserved */ +}; + /* virtchnl_rxq_info_flags - definition of bits in the flags field of the * virtchnl_rxq_info structure. * @@ -347,7 +389,12 @@ struct virtchnl_rxq_info { u32 databuffer_size; u32 max_pkt_size; u8 crc_disable; - u8 rxdid; + /* see enum virtchnl_rx_desc_ids; + * only used when VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC is supported. Note + * that when the offload is not supported, the descriptor format aligns + * with VIRTCHNL_RXDID_1_32B_BASE. 
+ */ + enum virtchnl_rx_desc_ids rxdid:8; enum virtchnl_rxq_info_flags flags:8; /* see virtchnl_rxq_info_flags */ u8 pad1; u64 dma_ring_addr; @@ -1050,6 +1097,7 @@ struct virtchnl_filter { VIRTCHNL_CHECK_STRUCT_LEN(272, virtchnl_filter); struct virtchnl_supported_rxdids { + /* see enum virtchnl_rx_desc_id_bitmasks */ u64 supported_rxdids; }; -- cgit v1.2.3 From 2a86e210f1a102614116e347efda59896f780417 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 6 Nov 2024 12:37:21 -0500 Subject: iavf: add support for negotiating flexible RXDID format Enable support for VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC, giving the VF driver the ability to determine which Rx descriptor formats are available. This requires sending an additional message during initialization and reset, the VIRTCHNL_OP_GET_SUPPORTED_RXDIDS. This operation requests the supported Rx descriptor IDs available from the PF. This is treated the same way that VLAN V2 capabilities are handled. Add a new set of extended capability flags, used to process the send and receipt of the VIRTCHNL_OP_GET_SUPPORTED_RXDIDS message. This ensures we finish negotiating for the supported descriptor formats prior to beginning configuration of receive queues. This change stores the supported format bitmap into the iavf_adapter structure. Additionally, if VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC is enabled by the PF, we need to make sure that the Rx queue configuration specifies the format. Signed-off-by: Jacob Keller Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Co-developed-by: Mateusz Polchlopek Signed-off-by: Mateusz Polchlopek Signed-off-by: Tony Nguyen --- include/linux/avf/virtchnl.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index bc10e6ffa50b..4811b9a14604 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -1096,11 +1096,6 @@ struct virtchnl_filter { VIRTCHNL_CHECK_STRUCT_LEN(272, virtchnl_filter); -struct virtchnl_supported_rxdids { - /* see enum virtchnl_rx_desc_id_bitmasks */ - u64 supported_rxdids; -}; - /* VIRTCHNL_OP_EVENT * PF sends this message to inform the VF driver of events that may affect it. * No direct response is expected from the VF, though it may generate other -- cgit v1.2.3 From a045e40645dfa02a68c17ad8a3c92a8ef62375b0 Mon Sep 17 00:00:00 2001 From: Swathi K S Date: Thu, 13 Feb 2025 09:45:59 +0530 Subject: net: stmmac: refactor clock management in EQoS driver Refactor clock management in the EQoS driver for code reuse and to avoid redundancy. This way, only minimal changes are required when a new platform is added.
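As an illustration of the refactor's intent, a glue driver might populate and enable the new bulk-clock fields roughly like this (a sketch; error handling is trimmed and the probe context is assumed):

#include <linux/clk.h>
#include <linux/stmmac.h>

static int example_eqos_probe_clks(struct device *dev,
				   struct plat_stmmacenet_data *plat)
{
	int ret;

	/* Fetch every clock attached to the device node in one call. */
	ret = devm_clk_bulk_get_all(dev, &plat->clks);
	if (ret < 0)
		return ret;
	plat->num_clks = ret;

	return clk_bulk_prepare_enable(plat->num_clks, plat->clks);
}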
Suggested-by: Andrew Lunn Signed-off-by: Swathi K S Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250213041559.106111-1-swathi.ks@samsung.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index c9878a612e53..24422ac4e417 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -254,6 +254,8 @@ struct plat_stmmacenet_data { struct clk *clk_ptp_ref; unsigned long clk_ptp_rate; unsigned long clk_ref_rate; + struct clk_bulk_data *clks; + int num_clks; unsigned int mult_fact_100ns; s32 ptp_max_adj; u32 cdc_error_adj; -- cgit v1.2.3 From e9f03a6a879bffea0ee51af87ac7a0c77716dda6 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 10 Feb 2025 10:53:39 +0000 Subject: net: phylink: add support for notifying PCS about EEE The stmmac driver has hooks into the XPCS to control the EEE settings when LPI is configured at the MAC. This bypasses the layering. To allow this to be removed from the stmmac driver, add two new PCS methods that inform the PCS when the LPI/EEE enablement state changes at the MAC. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1thRQ3-003w6u-RH@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 0de78673172d..41ab85e591ad 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -477,6 +477,10 @@ struct phylink_pcs { * @pcs_an_restart: restart 802.3z BaseX autonegotiation. * @pcs_link_up: program the PCS for the resolved link configuration * (where necessary). + * @pcs_disable_eee: optional notification to PCS that EEE has been disabled + * at the MAC. + * @pcs_enable_eee: optional notification to PCS that EEE will be enabled at + * the MAC. * @pcs_pre_init: configure PCS components necessary for MAC hardware * initialization e.g. RX clock for stmmac. */ @@ -500,6 +504,8 @@ struct phylink_pcs_ops { void (*pcs_an_restart)(struct phylink_pcs *pcs); void (*pcs_link_up)(struct phylink_pcs *pcs, unsigned int neg_mode, phy_interface_t interface, int speed, int duplex); + void (*pcs_disable_eee)(struct phylink_pcs *pcs); + void (*pcs_enable_eee)(struct phylink_pcs *pcs); int (*pcs_pre_init)(struct phylink_pcs *pcs); }; @@ -625,6 +631,22 @@ void pcs_an_restart(struct phylink_pcs *pcs); void pcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode, phy_interface_t interface, int speed, int duplex); +/** + * pcs_disable_eee() - Disable EEE at the PCS + * @pcs: a pointer to a &struct phylink_pcs + * + * Optional method informing the PCS that EEE has been disabled at the MAC. + */ +void pcs_disable_eee(struct phylink_pcs *pcs); + +/** + * pcs_enable_eee() - Enable EEE at the PCS + * @pcs: a pointer to a &struct phylink_pcs + * + * Optional method informing the PCS that EEE is about to be enabled at the MAC. + */ +void pcs_enable_eee(struct phylink_pcs *pcs); + /** + pcs_pre_init() - Configure PCS components necessary for MAC initialization * @pcs: a pointer to a &struct phylink_pcs. -- cgit v1.2.3 From 8c841486674a43df9db08210232987cf039b8942 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 10 Feb 2025 10:53:44 +0000 Subject: net: xpcs: add function to configure EEE clock multiplying factor Add a function to separate out the EEE clock multiplying factor.
This will be called by the stmmac driver to configure this value. It would have been better had the driver used the CLK API to retrieve this clock, get its rate and calculate the appropriate multiplier, but that door has closed. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1thRQ8-003w70-VT@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/pcs/pcs-xpcs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index 733f4ddd2ef1..749d40a9a086 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -52,6 +52,7 @@ struct phylink_pcs *xpcs_to_phylink_pcs(struct dw_xpcs *xpcs); int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface); int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable); +void xpcs_config_eee_mult_fact(struct dw_xpcs *xpcs, u8 mult_fact); struct dw_xpcs *xpcs_create_mdiodev(struct mii_bus *bus, int addr); struct dw_xpcs *xpcs_create_fwnode(struct fwnode_handle *fwnode); void xpcs_destroy(struct dw_xpcs *xpcs); -- cgit v1.2.3 From 55faeb89968aab5a32f0de93cd97fc1c4169d0f7 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 10 Feb 2025 10:54:05 +0000 Subject: net: xpcs: remove xpcs_config_eee() from global scope Make xpcs_config_eee() private to the XPCS driver, called only from the phylink pcs_disable_eee() and pcs_enable_eee() methods. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1thRQT-003w7O-Ec@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/pcs/pcs-xpcs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index 749d40a9a086..e40f554ff717 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -50,8 +50,6 @@ struct dw_xpcs; struct phylink_pcs *xpcs_to_phylink_pcs(struct dw_xpcs *xpcs); int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface); -int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, - int enable); void xpcs_config_eee_mult_fact(struct dw_xpcs *xpcs, u8 mult_fact); struct dw_xpcs *xpcs_create_mdiodev(struct mii_bus *bus, int addr); struct dw_xpcs *xpcs_create_fwnode(struct fwnode_handle *fwnode); -- cgit v1.2.3 From ea47e70e476ffb3fc8969c842d95609da24266b1 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 13 Feb 2025 22:48:11 +0100 Subject: net: phy: remove fixup-related definitions from phy.h which are not used outside phylib Certain fixup-related definitions aren't used outside phy_device.c. So make them private and remove them from phy.h. 
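For context, board code keeps using the public fixup helpers that remain exported; a minimal, made-up example (the PHY ID, mask, and fix applied are illustrative):

#include <linux/phy.h>

static int example_phy_fixup(struct phy_device *phydev)
{
	/* board-specific register tweaks would go here */
	return 0;
}

static int __init example_board_init(void)
{
	return phy_register_fixup_for_uid(0x00112233, 0xffffffff,
					  example_phy_fixup);
}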
Signed-off-by: Heiner Kallweit Reviewed-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/ea6fde13-9183-4c7c-8434-6c0eb64fc72c@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 64982eba71d1..245578ed710e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1287,9 +1287,6 @@ struct phy_driver { #define to_phy_driver(d) container_of_const(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) -#define PHY_ANY_ID "MATCH ANY PHY" -#define PHY_ANY_UID 0xffffffff - #define PHY_ID_MATCH_EXACT(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 0) #define PHY_ID_MATCH_MODEL(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 4) #define PHY_ID_MATCH_VENDOR(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 10) @@ -1322,15 +1319,6 @@ static inline bool phydev_id_compare(struct phy_device *phydev, u32 id) return phy_id_compare(id, phydev->phy_id, phydev->drv->phy_id_mask); } -/* A Structure for boards to register fixups with the PHY Lib */ -struct phy_fixup { - struct list_head list; - char bus_id[MII_BUS_ID_SIZE + 3]; - u32 phy_uid; - u32 phy_uid_mask; - int (*run)(struct phy_device *phydev); -}; - const char *phy_speed_to_str(int speed); const char *phy_duplex_to_str(unsigned int duplex); const char *phy_rate_matching_to_str(int rate_matching); @@ -2127,8 +2115,6 @@ s32 phy_get_internal_delay(struct phy_device *phydev, struct device *dev, void phy_resolve_pause(unsigned long *local_adv, unsigned long *partner_adv, bool *tx_pause, bool *rx_pause); -int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask, - int (*run)(struct phy_device *)); int phy_register_fixup_for_id(const char *bus_id, int (*run)(struct phy_device *)); int phy_register_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask, -- cgit v1.2.3 From d3a0e217f850a768851974a6efbd70f5673bb584 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 13 Feb 2025 22:49:19 +0100 Subject: net: phy: stop exporting feature arrays which aren't used outside phylib Stop exporting feature arrays which aren't used outside phylib. Signed-off-by: Heiner Kallweit Reviewed-by: Mateusz Polchlopek Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/01886672-4880-4ca8-b7b0-94d40f6e0ec5@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 245578ed710e..104c0b489991 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -54,11 +54,6 @@ extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap2_features) __ro_after_init; #define PHY_EEE_CAP2_FEATURES ((unsigned long *)&phy_eee_cap2_features) extern const int phy_basic_ports_array[3]; -extern const int phy_10_100_features_array[4]; -extern const int phy_basic_t1_features_array[3]; -extern const int phy_basic_t1s_p2mp_features_array[2]; -extern const int phy_gbit_features_array[2]; -extern const int phy_10gbit_features_array[1]; /* * Set phydev->irq to PHY_POLL if interrupts are not supported, -- cgit v1.2.3 From ef6249e37df5eac72bacdfe0a3000b08ae153146 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 13 Feb 2025 22:50:02 +0100 Subject: net: phy: stop exporting phy_queue_state_machine phy_queue_state_machine() isn't used outside phy.c, so stop exporting it. 
Signed-off-by: Heiner Kallweit Reviewed-by: Mateusz Polchlopek Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/16986d3d-7baf-4b02-a641-e2916d491264@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 104c0b489991..2ab83d24a573 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2071,7 +2071,6 @@ int phy_drivers_register(struct phy_driver *new_driver, int n, struct module *owner); void phy_error(struct phy_device *phydev); void phy_state_machine(struct work_struct *work); -void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies); void phy_trigger_machine(struct phy_device *phydev); void phy_mac_interrupt(struct phy_device *phydev); void phy_start_machine(struct phy_device *phydev); -- cgit v1.2.3 From 6b2edfba74696e0defd181989c9effe911e6f54f Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 13 Feb 2025 22:51:53 +0100 Subject: net: phy: remove helper phy_is_internal Helper phy_is_internal() is just used in two places phylib-internally. So let's remove it from the API. Signed-off-by: Heiner Kallweit Reviewed-by: Mateusz Polchlopek Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/f3f35265-80a9-4ed7-ad78-ae22c21e288b@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 2ab83d24a573..3665cdd610a3 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1739,15 +1739,6 @@ static inline bool phy_is_default_hwtstamp(struct phy_device *phydev) return phy_has_hwtstamp(phydev) && phydev->default_timestamp; } -/** - * phy_is_internal - Convenience function for testing if a PHY is internal - * @phydev: the phy_device struct - */ -static inline bool phy_is_internal(struct phy_device *phydev) -{ - return phydev->is_internal; -} - /** * phy_on_sfp - Convenience function for testing if a PHY is on an SFP module * @phydev: the phy_device struct -- cgit v1.2.3 From de38503b74e28c47e28ed800d2a8d12c713b2c63 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 13 Feb 2025 17:54:19 +0000 Subject: net: remove phylink_pcs .neg_mode boolean As all PCS are using the neg_mode parameter rather than the legacy an_mode, remove the ability to use the legacy an_mode. We remove the tests in the phylink code, unconditionally passing the PCS neg_mode parameter to PCS methods, and remove setting the flag from drivers. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tidPn-0040hd-2R@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 41ab85e591ad..08df65f6867a 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -442,7 +442,6 @@ struct phylink_pcs_ops; * are supported by this PCS. * @ops: a pointer to the &struct phylink_pcs_ops structure * @phylink: pointer to &struct phylink_config - * @neg_mode: provide PCS neg mode via "mode" argument * @poll: poll the PCS for link changes * @rxc_always_on: The MAC driver requires the reference clock * to always be on. 
Standalone PCS drivers which @@ -459,7 +458,6 @@ struct phylink_pcs { DECLARE_PHY_INTERFACE_MASK(supported_interfaces); const struct phylink_pcs_ops *ops; struct phylink *phylink; - bool neg_mode; bool poll; bool rxc_always_on; }; -- cgit v1.2.3 From 4a6f18f28627e121bd1f74b5fcc9f945d6dbeb1e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 10 Feb 2025 09:45:05 -0800 Subject: net/mlx4_core: Avoid impossible mlx4_db_alloc() order value GCC can see that the value range for "order" is capped, but this leads it to consider that it might be negative, leading to a false positive warning (with GCC 15 with -Warray-bounds -fdiagnostics-details): ../drivers/net/ethernet/mellanox/mlx4/alloc.c:691:47: error: array subscript -1 is below array bounds of 'long unsigned int *[2]' [-Werror=array-bounds=] 691 | i = find_first_bit(pgdir->bits[o], MLX4_DB_PER_PAGE >> o); | ~~~~~~~~~~~^~~ 'mlx4_alloc_db_from_pgdir': events 1-2 691 | i = find_first_bit(pgdir->bits[o], MLX4_DB_PER_PAGE >> o); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | | | | | (2) out of array bounds here | (1) when the condition is evaluated to true In file included from ../drivers/net/ethernet/mellanox/mlx4/mlx4.h:53, from ../drivers/net/ethernet/mellanox/mlx4/alloc.c:42: ../include/linux/mlx4/device.h:664:33: note: while referencing 'bits' 664 | unsigned long *bits[2]; | ^~~~ Switch the argument to unsigned int, which removes the compiler's need to consider negative values. Signed-off-by: Kees Cook Link: https://patch.msgid.link/20250210174504.work.075-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/mlx4/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 87edb7a8173b..f016263e1fcf 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1135,7 +1135,7 @@ int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, struct mlx4_buf *buf); -int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order); +int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, unsigned int order); void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db); int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, -- cgit v1.2.3 From 633488947ef66b194377411322dc9e12aab79b65 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 13 Feb 2025 15:50:22 +0100 Subject: cgroup/kernfs: Use RCU to access kernfs_node::parent. kernfs_rename_lock is used to obtain a stable kernfs_node::{name|parent} pointer. This is preparation for accessing kernfs_node::parent under RCU and ensuring that the pointer remains stable under the RCU lifetime guarantees. For a complete path, as it is done in kernfs_path_from_node(), the kernfs_rename_lock is still required in order to obtain a stable parent relationship while computing the relevant node depth. This must not change while the nodes are inspected in order to build the path. If the kernfs user never moves the nodes (changes the parent) then the kernfs_rename_lock is not required and the RCU guarantees are sufficient. This "restriction" can be set with KERNFS_ROOT_INVARIANT_PARENT. Otherwise the lock is required. Rename kernfs_node::parent to kernfs_node::__parent to denote the RCU access and use an RCU accessor while accessing the node. Make cgroup use KERNFS_ROOT_INVARIANT_PARENT since the parent here cannot change.
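A hedged sketch of the access pattern this enables; in-tree code goes through kernfs helpers, so this is only to show the RCU rule:

#include <linux/kernfs.h>
#include <linux/rcupdate.h>

static void example_walk_to_root(struct kernfs_node *kn)
{
	rcu_read_lock();
	while (kn) {
		/* __parent may only be chased under RCU protection unless
		 * the root was created with KERNFS_ROOT_INVARIANT_PARENT. */
		kn = rcu_dereference(kn->__parent);
	}
	rcu_read_unlock();
}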
Acked-by: Tejun Heo Cc: Yonghong Song Signed-off-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20250213145023.2820193-6-bigeasy@linutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 87c79d076d6d..5dda9a268e44 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -147,6 +147,11 @@ enum kernfs_root_flag { * Support user xattrs to be written to nodes rooted at this root. */ KERNFS_ROOT_SUPPORT_USER_XATTR = 0x0008, + + /* + * Renames must not change the parent node. + */ + KERNFS_ROOT_INVARIANT_PARENT = 0x0010, }; /* type-specific structures for kernfs_node union members */ @@ -199,8 +204,8 @@ struct kernfs_node { * never moved to a different parent, it is safe to access the * parent directly. */ - struct kernfs_node *parent; const char *name; + struct kernfs_node __rcu *__parent; struct rb_node rb; @@ -416,6 +421,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv); void kernfs_destroy_root(struct kernfs_root *root); +unsigned int kernfs_root_flags(struct kernfs_node *kn); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, @@ -514,6 +520,8 @@ kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, { return ERR_PTR(-ENOSYS); } static inline void kernfs_destroy_root(struct kernfs_root *root) { } +static inline unsigned int kernfs_root_flags(struct kernfs_node *kn) +{ return 0; } static inline struct kernfs_node * kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, -- cgit v1.2.3 From 741c10b096bc4dd79cd9f215b6ef173bb953e75c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 13 Feb 2025 15:50:23 +0100 Subject: kernfs: Use RCU to access kernfs_node::name. Using RCU lifetime rules to access kernfs_node::name can avoid the trouble with kernfs_rename_lock in kernfs_name() and kernfs_path_from_node() if the fs was created with KERNFS_ROOT_INVARIANT_PARENT. This is useful as it allows implementing kernfs_path_from_node() with only RCU protection, avoiding kernfs_rename_lock. The lock is only required if the __parent node can be changed and the function requires an unchanged hierarchy while it iterates from the node to its parent. The change is needed to allow the lookup of the node's path (kernfs_path_from_node()) from contexts which always run with preemption and/or interrupts disabled, even on PREEMPT_RT. The problem is that kernfs_rename_lock becomes a sleeping lock on PREEMPT_RT. I went through all ::name users and added the required access for the lookup, with a few extensions: - rdtgroup_pseudo_lock_create() drops all locks and then uses the name later on. resctrl supports rename with different parents. Here I made a temporary copy of the name while it is used outside of the lock. - kernfs_rename_ns() accepts NULL as new_parent. This simplifies sysfs_move_dir_ns() where it can set NULL in order to reuse the current name. - kernfs_rename_ns() only uses kernfs_rename_lock if the parents are different. All users use either kernfs_rwsem (for a stable path view) or just RCU for the lookup. The ::name field is always RCU-freed. Use RCU lifetime guarantees to access kernfs_node::name.
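Sketch of a name lookup that now works from non-sleeping context; the caller and buffer size are hypothetical:

#include <linux/kernfs.h>
#include <linux/printk.h>

static void example_log_node_name(struct kernfs_node *kn)
{
	char name[64];

	/* With RCU-freed names this no longer needs kernfs_rename_lock,
	 * so it is safe even with preemption disabled on PREEMPT_RT. */
	kernfs_name(kn, name, sizeof(name));
	pr_debug("kernfs node: %s\n", name);
}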
Suggested-by: Tejun Heo Acked-by: Tejun Heo Reported-by: syzbot+6ea37e2e6ffccf41a7e6@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/67251dc6.050a0220.529b6.015e.GAE@google.com/ Reported-by: Hillf Danton Closes: https://lore.kernel.org/20241102001224.2789-1-hdanton@sina.com Signed-off-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20250213145023.2820193-7-bigeasy@linutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5dda9a268e44..b5a5f32fdfd1 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -204,8 +204,8 @@ struct kernfs_node { * never moved to a different parent, it is safe to access the * parent directly. */ - const char *name; struct kernfs_node __rcu *__parent; + const char __rcu *name; struct rb_node rb; @@ -400,7 +400,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) } int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); -int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, +int kernfs_path_from_node(struct kernfs_node *kn_to, struct kernfs_node *kn_from, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); void pr_cont_kernfs_path(struct kernfs_node *kn); -- cgit v1.2.3 From a88927b534ba18019b0440cf3d7f068407b5250c Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Thu, 13 Feb 2025 13:05:15 +0000 Subject: firmware: add Exynos ACPM protocol driver Alive Clock and Power Manager (ACPM) Message Protocol is defined for the purpose of communication between the ACPM firmware and masters (AP, AOC, ...). ACPM firmware operates on the Active Power Management (APM) module that handles overall power activities. ACPM and masters regard each other as independent hardware components and communicate with each other using mailbox messages and shared memory. This protocol driver provides the interface for all the client drivers making use of the features offered by the APM. Add ACPM protocol support. Signed-off-by: Tudor Ambarus Link: https://lore.kernel.org/r/20250213-gs101-acpm-v9-2-8b0281b93c8b@linaro.org Signed-off-by: Krzysztof Kozlowski --- .../linux/firmware/samsung/exynos-acpm-protocol.h | 49 ++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 include/linux/firmware/samsung/exynos-acpm-protocol.h (limited to 'include/linux') diff --git a/include/linux/firmware/samsung/exynos-acpm-protocol.h b/include/linux/firmware/samsung/exynos-acpm-protocol.h new file mode 100644 index 000000000000..76255b5d06b1 --- /dev/null +++ b/include/linux/firmware/samsung/exynos-acpm-protocol.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2020 Samsung Electronics Co., Ltd. + * Copyright 2020 Google LLC. + * Copyright 2024 Linaro Ltd.
+ */ + +#ifndef __EXYNOS_ACPM_PROTOCOL_H +#define __EXYNOS_ACPM_PROTOCOL_H + +#include + +struct acpm_handle; + +struct acpm_pmic_ops { + int (*read_reg)(const struct acpm_handle *handle, + unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, + u8 *buf); + int (*bulk_read)(const struct acpm_handle *handle, + unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, + u8 count, u8 *buf); + int (*write_reg)(const struct acpm_handle *handle, + unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, + u8 value); + int (*bulk_write)(const struct acpm_handle *handle, + unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, + u8 count, const u8 *buf); + int (*update_reg)(const struct acpm_handle *handle, + unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, + u8 value, u8 mask); +}; + +struct acpm_ops { + struct acpm_pmic_ops pmic_ops; +}; + +/** + * struct acpm_handle - Reference to an initialized protocol instance + * @ops: + */ +struct acpm_handle { + struct acpm_ops ops; +}; + +struct device; + +const struct acpm_handle *devm_acpm_get_by_phandle(struct device *dev, + const char *property); +#endif /* __EXYNOS_ACPM_PROTOCOL_H */ -- cgit v1.2.3 From 4018ab42636cbea1bcf4fd69afc31d51cd905ba0 Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Fri, 14 Feb 2025 15:19:47 +0200 Subject: iio: backend: add API for interface get Add backend support for obtaining the interface type used. Reviewed-by: Nuno Sa Reviewed-by: David Lechner Signed-off-by: Antoniu Miclaus Link: https://patch.msgid.link/20250214131955.31973-2-antoniu.miclaus@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index 10be00f3b120..a0ea6c29d7ba 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -70,6 +70,12 @@ enum iio_backend_sample_trigger { IIO_BACKEND_SAMPLE_TRIGGER_MAX }; +enum iio_backend_interface_type { + IIO_BACKEND_INTERFACE_SERIAL_LVDS, + IIO_BACKEND_INTERFACE_SERIAL_CMOS, + IIO_BACKEND_INTERFACE_MAX +}; + /** * struct iio_backend_ops - operations structure for an iio_backend * @enable: Enable backend. @@ -88,6 +94,7 @@ enum iio_backend_sample_trigger { * @extend_chan_spec: Extend an IIO channel. * @ext_info_set: Extended info setter. * @ext_info_get: Extended info getter. + * @interface_type_get: Interface type. * @read_raw: Read a channel attribute from a backend device * @debugfs_print_chan_status: Print channel status into a buffer. * @debugfs_reg_access: Read or write register value of backend. 
@@ -128,6 +135,8 @@ struct iio_backend_ops { const char *buf, size_t len); int (*ext_info_get)(struct iio_backend *back, uintptr_t private, const struct iio_chan_spec *chan, char *buf); + int (*interface_type_get)(struct iio_backend *back, + enum iio_backend_interface_type *type); int (*read_raw)(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); @@ -186,6 +195,8 @@ ssize_t iio_backend_ext_info_set(struct iio_dev *indio_dev, uintptr_t private, const char *buf, size_t len); ssize_t iio_backend_ext_info_get(struct iio_dev *indio_dev, uintptr_t private, const struct iio_chan_spec *chan, char *buf); +int iio_backend_interface_type_get(struct iio_backend *back, + enum iio_backend_interface_type *type); int iio_backend_read_raw(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); -- cgit v1.2.3 From fc3fdb835eebb2ba88c6fdbf2c8b9eae1ceab106 Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Fri, 14 Feb 2025 15:19:48 +0200 Subject: iio: backend: add support for data size set Add backend support for setting the data size used. This setting can be adjusted within the IP cores interfacing devices. Reviewed-by: Nuno Sa Reviewed-by: David Lechner Signed-off-by: Antoniu Miclaus Link: https://patch.msgid.link/20250214131955.31973-3-antoniu.miclaus@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index a0ea6c29d7ba..9ae861a21472 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -95,6 +95,7 @@ enum iio_backend_interface_type { * @ext_info_set: Extended info setter. * @ext_info_get: Extended info getter. * @interface_type_get: Interface type. + * @data_size_set: Data size. * @read_raw: Read a channel attribute from a backend device * @debugfs_print_chan_status: Print channel status into a buffer. * @debugfs_reg_access: Read or write register value of backend. @@ -137,6 +138,7 @@ struct iio_backend_ops { const struct iio_chan_spec *chan, char *buf); int (*interface_type_get)(struct iio_backend *back, enum iio_backend_interface_type *type); + int (*data_size_set)(struct iio_backend *back, unsigned int size); int (*read_raw)(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); @@ -197,6 +199,7 @@ ssize_t iio_backend_ext_info_get(struct iio_dev *indio_dev, uintptr_t private, const struct iio_chan_spec *chan, char *buf); int iio_backend_interface_type_get(struct iio_backend *back, enum iio_backend_interface_type *type); +int iio_backend_data_size_set(struct iio_backend *back, unsigned int size); int iio_backend_read_raw(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); -- cgit v1.2.3 From 22894e0be908791e3df011cdfac02589c2f340bd Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Fri, 14 Feb 2025 15:19:49 +0200 Subject: iio: backend: add API for oversampling Add backend support for setting oversampling ratio. 
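For illustration only, a minimal sketch of how a converter front-end driver might drive the new backend setters during setup; struct my_adc, its ->back member and the chosen values are hypothetical:

	static int my_adc_setup(struct my_adc *st)
	{
		int ret;

		/* 16-bit samples on the data interface */
		ret = iio_backend_data_size_set(st->back, 16);
		if (ret)
			return ret;

		/* let the IP core decimate by a factor of 4 */
		return iio_backend_oversampling_ratio_set(st->back, 4);
	}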
Reviewed-by: David Lechner Signed-off-by: Antoniu Miclaus Link: https://patch.msgid.link/20250214131955.31973-4-antoniu.miclaus@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index 9ae861a21472..e45b7dfbec35 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -96,6 +96,7 @@ enum iio_backend_interface_type { * @ext_info_get: Extended info getter. * @interface_type_get: Interface type. * @data_size_set: Data size. + * @oversampling_ratio_set: Set Oversampling ratio. * @read_raw: Read a channel attribute from a backend device * @debugfs_print_chan_status: Print channel status into a buffer. * @debugfs_reg_access: Read or write register value of backend. @@ -139,6 +140,8 @@ struct iio_backend_ops { int (*interface_type_get)(struct iio_backend *back, enum iio_backend_interface_type *type); int (*data_size_set)(struct iio_backend *back, unsigned int size); + int (*oversampling_ratio_set)(struct iio_backend *back, + unsigned int ratio); int (*read_raw)(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); @@ -200,6 +203,8 @@ ssize_t iio_backend_ext_info_get(struct iio_dev *indio_dev, uintptr_t private, int iio_backend_interface_type_get(struct iio_backend *back, enum iio_backend_interface_type *type); int iio_backend_data_size_set(struct iio_backend *back, unsigned int size); +int iio_backend_oversampling_ratio_set(struct iio_backend *back, + unsigned int ratio); int iio_backend_read_raw(struct iio_backend *back, struct iio_chan_spec const *chan, int *val, int *val2, long mask); -- cgit v1.2.3 From 7665054ee0dd0caba6f5f7bae48aa5f221218496 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Fri, 14 Feb 2025 20:40:00 +0100 Subject: nodemask: add nodes_copy() The nodemask API lacks a plain nodes_copy(), which is required in this series. Signed-off-by: Yury Norov [NVIDIA] Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- include/linux/nodemask.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 9fd7a0ce9c1a..41cf43c4e70f 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -191,6 +191,13 @@ static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *s bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } +#define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES) +static __always_inline void __nodes_copy(nodemask_t *dstp, + const nodemask_t *srcp, unsigned int nbits) +{ + bitmap_copy(dstp->bits, srcp->bits, nbits); +} + #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) static __always_inline void __nodes_complement(nodemask_t *dstp, -- cgit v1.2.3 From 14a8262f505bc4478c50e66309057bc0d0d4b62e Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Fri, 14 Feb 2025 20:40:01 +0100 Subject: nodemask: numa: reorganize inclusion path Nodemasks now pull in linux/numa.h for the MAX_NUMNODES and NUMA_NO_NODE macros. This series makes numa.h depend on nodemasks, so we would hit a circular dependency. The nodemask library is heavily used by NUMA code, and it is logical to resolve the circular dependency by making the NUMA headers depend on nodemask.h.
Signed-off-by: Yury Norov [NVIDIA] Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- include/linux/nodemask.h | 1 - include/linux/nodemask_types.h | 11 ++++++++++- include/linux/numa.h | 10 +--------- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 41cf43c4e70f..f0ac0633366b 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -94,7 +94,6 @@ #include #include #include -#include #include extern nodemask_t _unused_nodemask_arg_; diff --git a/include/linux/nodemask_types.h b/include/linux/nodemask_types.h index 6b28d97ea6ed..f850a48742f1 100644 --- a/include/linux/nodemask_types.h +++ b/include/linux/nodemask_types.h @@ -3,7 +3,16 @@ #define __LINUX_NODEMASK_TYPES_H #include -#include + +#ifdef CONFIG_NODES_SHIFT +#define NODES_SHIFT CONFIG_NODES_SHIFT +#else +#define NODES_SHIFT 0 +#endif + +#define MAX_NUMNODES (1 << NODES_SHIFT) + +#define NUMA_NO_NODE (-1) typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; diff --git a/include/linux/numa.h b/include/linux/numa.h index 3567e40329eb..31d8bf8a951a 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -3,16 +3,8 @@ #define _LINUX_NUMA_H #include #include +#include -#ifdef CONFIG_NODES_SHIFT -#define NODES_SHIFT CONFIG_NODES_SHIFT -#else -#define NODES_SHIFT 0 -#endif - -#define MAX_NUMNODES (1 << NODES_SHIFT) - -#define NUMA_NO_NODE (-1) #define NUMA_NO_MEMBLK (-1) static inline bool numa_valid_node(int nid) -- cgit v1.2.3 From 16d79f2a4f155b54e7580563c6c03832506b3b09 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 14 Feb 2025 20:40:02 +0100 Subject: mm/numa: Introduce nearest_node_nodemask() Introduce the new helper nearest_node_nodemask() to find the closest node in a specified nodemask from a given starting node. Returns MAX_NUMNODES if no node is found. Suggested-by: Yury Norov [NVIDIA] Signed-off-by: Andrea Righi Acked-by: Yury Norov [NVIDIA] Signed-off-by: Tejun Heo --- include/linux/numa.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/numa.h b/include/linux/numa.h index 31d8bf8a951a..e6baaf6051bc 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -31,6 +31,8 @@ void __init alloc_offline_node_data(int nid); /* Generic implementation available */ int numa_nearest_node(int node, unsigned int state); +int nearest_node_nodemask(int node, nodemask_t *mask); + #ifndef memory_add_physaddr_to_nid int memory_add_physaddr_to_nid(u64 start); #endif @@ -47,6 +49,11 @@ static inline int numa_nearest_node(int node, unsigned int state) return NUMA_NO_NODE; } +static inline int nearest_node_nodemask(int node, nodemask_t *mask) +{ + return NUMA_NO_NODE; +} + static inline int memory_add_physaddr_to_nid(u64 start) { return 0; } -- cgit v1.2.3 From f09177ca5f242a32368a2e9414dce4c90dc1d405 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 14 Feb 2025 20:40:03 +0100 Subject: sched/topology: Introduce for_each_node_numadist() iterator Introduce the new helper for_each_node_numadist() to iterate over node IDs in order of increasing NUMA distance from a given starting node. This iterator is somewhat similar to for_each_numa_hop_mask(), but instead of providing a cpumask at each iteration, it provides a node ID.
Example usage: nodemask_t unvisited = NODE_MASK_ALL; int node, start = cpu_to_node(smp_processor_id()); node = start; for_each_node_numadist(node, unvisited) pr_info("node (%d, %d) -> %d\n", start, node, node_distance(start, node)); On a system with equidistant nodes: $ numactl -H ... node distances: node 0 1 2 3 0: 10 20 20 20 1: 20 10 20 20 2: 20 20 10 20 3: 20 20 20 10 Output of the example above (on node 0): [ 7.367022] node (0, 0) -> 10 [ 7.367151] node (0, 1) -> 20 [ 7.367186] node (0, 2) -> 20 [ 7.367247] node (0, 3) -> 20 On a system with non-equidistant nodes (simulated using virtme-ng): $ numactl -H ... node distances: node 0 1 2 3 0: 10 51 31 41 1: 51 10 21 61 2: 31 21 10 11 3: 41 61 11 10 Output of the example above (on node 0): [ 8.953644] node (0, 0) -> 10 [ 8.953712] node (0, 2) -> 31 [ 8.953764] node (0, 3) -> 41 [ 8.953817] node (0, 1) -> 51 Suggested-by: Yury Norov [NVIDIA] Signed-off-by: Andrea Righi Acked-by: Yury Norov [NVIDIA] Signed-off-by: Tejun Heo --- include/linux/topology.h | 30 ++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index 52f5850730b3..a1815f4395ab 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -261,6 +261,36 @@ sched_numa_hop_mask(unsigned int node, unsigned int hops) } #endif /* CONFIG_NUMA */ +/** + * for_each_node_numadist() - iterate over nodes in increasing distance + * order, starting from a given node + * @node: the iteration variable and the starting node. + * @unvisited: a nodemask to keep track of the unvisited nodes. + * + * This macro iterates over NUMA node IDs in increasing distance from the + * starting @node and yields MAX_NUMNODES when all the nodes have been + * visited. + * + * Note that by the time the loop completes, the @unvisited nodemask will + * be fully cleared, unless the loop exits early. + * + * The difference between for_each_node() and for_each_node_numadist() is + * that the former allows to iterate over nodes in numerical order, whereas + * the latter iterates over nodes in increasing order of distance. + * + * The complexity of this iterator is O(N^2), where N represents the + * number of nodes, as each iteration involves scanning all nodes to + * find the one with the shortest distance. + * + * Requires rcu_lock to be held. + */ +#define for_each_node_numadist(node, unvisited) \ + for (int __start = (node), \ + (node) = nearest_node_nodemask((__start), &(unvisited)); \ + (node) < MAX_NUMNODES; \ + node_clear((node), (unvisited)), \ + (node) = nearest_node_nodemask((__start), &(unvisited))) /** * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance * from a given node. -- cgit v1.2.3 From 13ee854e7c04236a47a5beaacdcf51eb0bc7a8fa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 5 Feb 2025 11:36:46 +0000 Subject: io_uring/kbuf: remove legacy kbuf caching Remove all struct io_buffer caches. It makes it a fair bit simpler. Apart from killing a bunch of lines and juggling between lists, __io_put_kbuf_list() doesn't need ->completion_lock locking now.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/18287217466ee2576ea0b1e72daccf7b22c7e856.1738724373.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3def525a1da3..e2fef264ff8b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -360,7 +360,6 @@ struct io_ring_ctx { spinlock_t completion_lock; - struct list_head io_buffers_comp; struct list_head cq_overflow_list; struct hlist_head waitid_list; @@ -379,8 +378,6 @@ struct io_ring_ctx { unsigned int file_alloc_start; unsigned int file_alloc_end; - struct list_head io_buffers_cache; - /* Keep this last, we don't need it for the fast path */ struct wait_queue_head poll_wq; struct io_restriction restrictions; -- cgit v1.2.3 From bcf8a0293a019bb0c4aebafdebe9a1e7a923249a Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sun, 16 Feb 2025 19:25:04 -0700 Subject: io_uring: introduce type alias for io_tw_state In preparation for changing how io_tw_state is passed, introduce a type alias io_tw_token_t for struct io_tw_state *. This allows for changing the representation in one place, without having to update the many functions that just forward their struct io_tw_state * argument. Also add a comment to struct io_tw_state to explain its purpose. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250217022511.1150145-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e2fef264ff8b..ea4694ee9d19 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -436,8 +436,15 @@ struct io_ring_ctx { struct io_mapped_region param_region; }; +/* + * Token indicating function is called in task work context: + * ctx->uring_lock is held and any completions generated will be flushed. + * ONLY core io_uring.c should instantiate this struct. + */ struct io_tw_state { }; +/* Alias to use in code that doesn't instantiate struct io_tw_state */ +typedef struct io_tw_state *io_tw_token_t; enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, @@ -563,7 +570,7 @@ enum { REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT), }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw); struct io_task_work { struct llist_node node; -- cgit v1.2.3 From 94a4274bb6ebc5b4293559304d0f00928de0d8c0 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sun, 16 Feb 2025 19:25:05 -0700 Subject: io_uring: pass struct io_tw_state by value 8e5b3b89ecaf ("io_uring: remove struct io_tw_state::locked") removed the only field of io_tw_state but kept it as a task work callback argument to "forc[e] users not to invoke them carelessly out of a wrong context". Passing the struct io_tw_state * argument adds a few instructions to all callers that can't inline the functions and see the argument is unused. So pass struct io_tw_state by value instead. Since it's a 0-sized value, it can be passed without any instructions needed to initialize it. 
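As a minimal illustration (not part of this patch) of why this is free: an empty struct has size zero in kernel C, so passing it by value consumes no registers or stack, while the parameter still documents the calling context:

	struct example_token {};	/* hypothetical stand-in for io_tw_state */

	static void example_cb(struct example_token tw)
	{
		/* tw exists only to prove we run from task work context */
	}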
Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250217022511.1150145-2-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index ea4694ee9d19..123e69368730 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -444,7 +444,7 @@ struct io_ring_ctx { struct io_tw_state { }; /* Alias to use in code that doesn't instantiate struct io_tw_state */ -typedef struct io_tw_state *io_tw_token_t; +typedef struct io_tw_state io_tw_token_t; enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, -- cgit v1.2.3 From 6f377873cb23905009759b7366b9fe85c2a6ff37 Mon Sep 17 00:00:00 2001 From: David Wei Date: Fri, 14 Feb 2025 16:09:36 -0800 Subject: io_uring/zcrx: add interface queue and refill queue Add a new object called an interface queue (ifq) that represents a net rx queue that has been configured for zero copy. Each ifq is registered using a new registration opcode IORING_REGISTER_ZCRX_IFQ. The refill queue is allocated by the kernel and mapped by userspace using a new offset IORING_OFF_RQ_RING, in a similar fashion to the main SQ/CQ. It is used by userspace to return buffers that it is done with, which will then be re-used by the netdev again. The main CQ ring is used to notify userspace of received data by using the upper 16 bytes of a big CQE as a new struct io_uring_zcrx_cqe. Each entry contains the offset + len to the data. For now, each io_uring instance only has a single ifq. Reviewed-by: Jens Axboe Signed-off-by: David Wei Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20250215000947.789731-2-dw@davidwei.uk Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 123e69368730..c0fe8a00fe53 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -40,6 +40,8 @@ enum io_uring_cmd_flags { IO_URING_F_TASK_DEAD = (1 << 13), }; +struct io_zcrx_ifq; + struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -382,6 +384,8 @@ struct io_ring_ctx { struct wait_queue_head poll_wq; struct io_restriction restrictions; + struct io_zcrx_ifq *ifq; + u32 pers_next; struct xarray personalities; @@ -434,6 +438,8 @@ struct io_ring_ctx { struct io_mapped_region ring_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; + /* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */ + struct io_mapped_region zcrx_region; }; /* -- cgit v1.2.3 From d795e38df4b7ebac1072bbf7d8a5500c1ea83332 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 9 Feb 2025 18:05:58 +0000 Subject: iio: core: Rework claim and release of direct mode to work with sparse. Initial thought was to do something similar to __cond_lock() do_iio_device_claim_direct_mode(iio_dev) ? : ({ __acquire(iio_dev); 0; }) + Appropriate static inline iio_device_release_direct_mode() However with that, sparse generates false positives. E.g. 
drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c:1811:17: warning: context imbalance in 'st_lsm6dsx_read_raw' - unexpected unlock So instead, this patch rethinks the return type and makes it more 'conditional lock like' (which is part of what is going on under the hood anyway) and returns a boolean - true for successfully acquired, false for did not acquire. To allow a migration path given the rework is now non-trivial, take a leaf out of the naming of the conditional guard we currently have for IIO device direct mode and drop the _mode postfix from the new functions, giving iio_device_claim_direct() and iio_device_release_direct(). Whilst the kernel supports __cond_acquires(), upstream sparse does not yet do so. Hence rely on sparse expanding a static inline wrapper to explicitly see whether __acquire() is called. Note that even with the solution here, sparse sometimes gives false positives. However, in the few cases seen, they were complex code structures that benefited from simplification anyway. Reviewed-by: David Lechner Reviewed-by: Nuno Sa Link: https://patch.msgid.link/20250209180624.701140-2-jic23@kernel.org Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 56161e02f002..5ed03e36178f 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include /* IIO TODO LIST */ @@ -662,6 +663,31 @@ int iio_push_event(struct iio_dev *indio_dev, u64 ev_code, s64 timestamp); int iio_device_claim_direct_mode(struct iio_dev *indio_dev); void iio_device_release_direct_mode(struct iio_dev *indio_dev); +/* + * Helper functions that allow claim and release of direct mode + * in a fashion that doesn't generate many false positives from sparse. + * Note this must remain static inline in the header so that sparse + * can see the __acquire() marking. Revisit when sparse supports + * __cond_acquires() + */ +static inline bool iio_device_claim_direct(struct iio_dev *indio_dev) +{ + int ret = iio_device_claim_direct_mode(indio_dev); + + if (ret) + return false; + + __acquire(iio_dev); + + return true; +} + +static inline void iio_device_release_direct(struct iio_dev *indio_dev) +{ + iio_device_release_direct_mode(indio_dev); + __release(indio_dev); +} + /* * This autocleanup logic is normally used via * iio_device_claim_direct_scoped(). -- cgit v1.2.3 From 4c571885898c5c98934d086f2ab11b5e27e4f41f Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 9 Feb 2025 18:06:24 +0000 Subject: iio: Drop iio_device_claim_direct_scoped() and related infrastructure Scoped conditional automated cleanup turned out to be harder to work with than expected. Despite several attempts to find a better solution none have surfaced. As such rip it out of the IIO code.
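The resulting calling convention, as an illustrative sketch (mydrv_read_sample() is hypothetical):

	int ret;

	if (!iio_device_claim_direct(indio_dev))
		return -EBUSY;

	ret = mydrv_read_sample(indio_dev, val);
	iio_device_release_direct(indio_dev);

	return ret;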
Reviewed-by: David Lechner Reviewed-by: Nuno Sa Link: https://patch.msgid.link/20250209180624.701140-28-jic23@kernel.org Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 5ed03e36178f..07a0e8132e88 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -9,7 +9,6 @@ #include #include -#include #include #include #include @@ -688,32 +687,6 @@ static inline void iio_device_release_direct(struct iio_dev *indio_dev) __release(indio_dev); } -/* - * This autocleanup logic is normally used via - * iio_device_claim_direct_scoped(). - */ -DEFINE_GUARD(iio_claim_direct, struct iio_dev *, iio_device_claim_direct_mode(_T), - iio_device_release_direct_mode(_T)) - -DEFINE_GUARD_COND(iio_claim_direct, _try, ({ - struct iio_dev *dev; - int d = iio_device_claim_direct_mode(_T); - - if (d < 0) - dev = NULL; - else - dev = _T; - dev; - })) - -/** - * iio_device_claim_direct_scoped() - Scoped call to iio_device_claim_direct. - * @fail: What to do on failure to claim device. - * @iio_dev: Pointer to the IIO devices structure - */ -#define iio_device_claim_direct_scoped(fail, iio_dev) \ - scoped_cond_guard(iio_claim_direct_try, fail, iio_dev) - int iio_device_claim_buffer_mode(struct iio_dev *indio_dev); void iio_device_release_buffer_mode(struct iio_dev *indio_dev); -- cgit v1.2.3 From b2108fc82a0acda34388bff3e3ee3544013b1623 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 13 Feb 2025 20:24:00 +0200 Subject: drm: Move for_each_if() to util_macros.h for wider use Other subsystem(s) may want to reuse the for_each_if() macro. Move it to util_macros.h to make it globally available. Suggested-by: Bartosz Golaszewski Signed-off-by: Andy Shevchenko Acked-by: Jani Nikula Reviewed-by: Bartosz Golaszewski Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250213182527.3092371-2-andriy.shevchenko@linux.intel.com Signed-off-by: Bartosz Golaszewski --- include/linux/util_macros.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h index 825487fb66fa..3b570b765b75 100644 --- a/include/linux/util_macros.h +++ b/include/linux/util_macros.h @@ -4,6 +4,21 @@ #include +/** + * for_each_if - helper for handling conditionals in various for_each macros + * @condition: The condition to check + * + * Typical use:: + * + * #define for_each_foo_bar(x, y) \ + * list_for_each_entry(x, y->list, head) \ + * for_each_if(x->something == SOMETHING) + * + * The for_each_if() macro makes the use of for_each_foo_bar() less error + * prone. + */ +#define for_each_if(condition) if (!(condition)) {} else + /** * find_closest - locate the closest element in a sorted array * @x: The reference value. -- cgit v1.2.3 From 23318614f8c146d440e57a69d7171dd8c0d249b7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 13 Feb 2025 20:24:01 +0200 Subject: gpiolib: Switch to use for_each_if() helper The conditional for_each_*() APIs can be written more concisely and less error-prone with the for_each_if() helper in use. Switch them to use this helper.
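Schematically, each conversion replaces the open-coded dangling-else trick with the helper; for example (hypothetical iterator, for illustration only):

	#define for_each_enabled_widget(w, head)		\
		list_for_each_entry(w, head, node)		\
			for_each_if((w)->enabled)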
Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250213182527.3092371-3-andriy.shevchenko@linux.intel.com Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 89439be2ddaa..10544f4a03e5 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_GENERIC_MSI_IRQ #include @@ -562,7 +563,7 @@ DEFINE_CLASS(_gpiochip_for_each_data, for (CLASS(_gpiochip_for_each_data, _data)(&_label, &_i); \ _i < _size; \ _i++, kfree(_label), _label = NULL) \ - if (IS_ERR(_label = gpiochip_dup_line_label(_chip, _base + _i))) {} else + for_each_if(!IS_ERR(_label = gpiochip_dup_line_label(_chip, _base + _i))) /** * for_each_hwgpio - Iterates over all GPIOs for given chip. @@ -584,7 +585,7 @@ DEFINE_CLASS(_gpiochip_for_each_data, */ #define for_each_requested_gpio_in_range(_chip, _i, _base, _size, _label) \ for_each_hwgpio_in_range(_chip, _i, _base, _size, _label) \ - if (_label == NULL) {} else + for_each_if(_label) /* Iterates over all requested GPIO of the given @chip */ #define for_each_requested_gpio(chip, i, label) \ @@ -870,7 +871,7 @@ static inline void gpiochip_unlock_as_irq(struct gpio_chip *gc, #define for_each_gpiochip_node(dev, child) \ device_for_each_child_node(dev, child) \ - if (!fwnode_property_present(child, "gpio-controller")) {} else + for_each_if(fwnode_property_present(child, "gpio-controller")) static inline unsigned int gpiochip_node_count(struct device *dev) { -- cgit v1.2.3 From b7c9f32614f17c76ab26b8ee0109c868d7e054fe Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 17 Feb 2025 15:38:43 +0000 Subject: firmware: arm_ffa: Replace UUID buffer to standard UUID format Currently the ffa_partition_info structure holds the UUID in a format compatible with the firmware interface. However, most of the functions in the FF-A core driver deal directly with the uuid_t type. Replace the UUID buffer with the standard UUID format in the ffa_partition_info structure. Tested-by: Viresh Kumar Message-Id: <20250217-ffa_updates-v3-2-bd1d9de615e7@arm.com> Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 74169dd0f659..abd0208f0f74 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -238,7 +238,7 @@ struct ffa_partition_info { /* partition runs in the AArch64 execution state. */ #define FFA_PARTITION_AARCH64_EXEC BIT(8) u32 properties; - u32 uuid[4]; + uuid_t uuid; }; static inline -- cgit v1.2.3 From 8768972cbbeacd673f09dc1f8e198e03ccfa9f52 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 17 Feb 2025 15:38:44 +0000 Subject: firmware: arm_ffa: Align sync_send_receive{,2} function prototypes Currently ffa_sync_send_receive2() takes the UUID as a separate parameter instead of using the one available in the ffa_device structure. Change the prototype of ffa_sync_send_receive2() to align with ffa_sync_send_receive() and use ffa_device->uuid.
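A sketch of the resulting call site (illustrative only; error handling trimmed, and the ops layout is assumed to follow the existing ffa_ops/msg_ops structure):

	struct ffa_send_direct_data2 data = {};
	int ret;

	/* the UUID is now taken from ffa_dev->uuid internally */
	ret = ffa_dev->ops->msg_ops->sync_send_receive2(ffa_dev, &data);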
Tested-by: Viresh Kumar Message-Id: <20250217-ffa_updates-v3-3-bd1d9de615e7@arm.com> Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index abd0208f0f74..e8b8ae8b192a 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -439,7 +439,7 @@ struct ffa_msg_ops { int (*sync_send_receive)(struct ffa_device *dev, struct ffa_send_direct_data *data); int (*indirect_send)(struct ffa_device *dev, void *buf, size_t sz); - int (*sync_send_receive2)(struct ffa_device *dev, const uuid_t *uuid, + int (*sync_send_receive2)(struct ffa_device *dev, struct ffa_send_direct_data2 *data); }; -- cgit v1.2.3 From 46dcd68aaccac0812c12ec3f4e59c8963e2760ad Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 17 Feb 2025 15:38:49 +0000 Subject: firmware: arm_ffa: Unregister the FF-A devices when cleaning up the partitions Both the FF-A core and the bus were in a single module before the commit 18c250bd7ed0 ("firmware: arm_ffa: Split bus and driver into distinct modules"). The arm_ffa_bus_exit() takes care of unregistering all the FF-A devices. Now that there are 2 distinct modules, if the core driver is unloaded and reloaded, it will end up adding duplicate FF-A devices as the previously registered devices weren't unregistered when we cleaned up the modules. Fix the same by unregistering all the FF-A devices on the FF-A bus during the cleanup of the partitions and hence the cleanup of the module. Fixes: 18c250bd7ed0 ("firmware: arm_ffa: Split bus and driver into distinct modules") Tested-by: Viresh Kumar Message-Id: <20250217-ffa_updates-v3-8-bd1d9de615e7@arm.com> Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index e8b8ae8b192a..ca2ad5b0ac43 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -176,6 +176,7 @@ void ffa_device_unregister(struct ffa_device *ffa_dev); int ffa_driver_register(struct ffa_driver *driver, struct module *owner, const char *mod_name); void ffa_driver_unregister(struct ffa_driver *driver); +void ffa_devices_unregister(void); bool ffa_device_is_valid(struct ffa_device *ffa_dev); #else @@ -188,6 +189,8 @@ ffa_device_register(const struct ffa_partition_info *part_info, static inline void ffa_device_unregister(struct ffa_device *dev) {} +static inline void ffa_devices_unregister(void) {} + static inline int ffa_driver_register(struct ffa_driver *driver, struct module *owner, const char *mod_name) -- cgit v1.2.3 From 84968e32d301f80bd37fa8d4f370528b714bdc2f Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 17 Feb 2025 15:38:50 +0000 Subject: firmware: arm_ffa: Helper to check if a partition can receive REQUEST2 messages Add a helper that allows FF-A drivers to check if the partition can receive direct requests via the FFA_MSG_SEND_DIRECT_REQ2 ABI. Tested-by: Viresh Kumar Message-Id: <20250217-ffa_updates-v3-9-bd1d9de615e7@arm.com> Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index ca2ad5b0ac43..761ea8fe3bb6 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -240,6 +240,10 @@ struct ffa_partition_info { #define FFA_PARTITION_NOTIFICATION_RECV BIT(3) /* partition runs in the AArch64 execution state.
*/ #define FFA_PARTITION_AARCH64_EXEC BIT(8) +/* partition supports receipt of direct request2 */ +#define FFA_PARTITION_DIRECT_REQ2_RECV BIT(9) +/* partition can send direct request2. */ +#define FFA_PARTITION_DIRECT_REQ2_SEND BIT(10) u32 properties; uuid_t uuid; }; @@ -259,6 +263,10 @@ bool ffa_partition_check_property(struct ffa_device *dev, u32 property) #define ffa_partition_supports_direct_recv(dev) \ ffa_partition_check_property(dev, FFA_PARTITION_DIRECT_RECV) +#define ffa_partition_supports_direct_req2_recv(dev) \ + (ffa_partition_check_property(dev, FFA_PARTITION_DIRECT_REQ2_RECV) && \ + !dev->mode_32bit) + /* For use with FFA_MSG_SEND_DIRECT_{REQ,RESP} which pass data via registers */ struct ffa_send_direct_data { unsigned long data0; /* w3/x3 */ -- cgit v1.2.3 From 910cc1acc9b42186db586c2fa9b48de34d651e7b Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 17 Feb 2025 15:38:51 +0000 Subject: firmware: arm_ffa: Add support for passing UUID in FFA_MSG_SEND2 FF-A v1.2 introduces a UUID field in the partition message header used by FFA_MSG_SEND2, enabling partitions/endpoints to expose multiple UUIDs. Add support for passing the UUID in FFA_MSG_SEND2. Tested-by: Viresh Kumar Message-Id: <20250217-ffa_updates-v3-10-bd1d9de615e7@arm.com> Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 761ea8fe3bb6..bd78deeff284 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -282,6 +282,7 @@ struct ffa_indirect_msg_hdr { u32 offset; u32 send_recv_id; u32 size; + uuid_t uuid; }; /* For use with FFA_MSG_SEND_DIRECT_{REQ,RESP}2 which pass data via registers */ -- cgit v1.2.3 From 9fac08d9d9855fbb49bf2342c964ad3d861bfafe Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 17 Feb 2025 15:38:52 +0000 Subject: firmware: arm_ffa: Upgrade FF-A version to v1.2 in the driver The basic and mandatory features of FF-A v1.2 are all supported now, so the version supported by the driver can be bumped from v1.1 to v1.2. Tested-by: Viresh Kumar Message-Id: <20250217-ffa_updates-v3-11-bd1d9de615e7@arm.com> Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index bd78deeff284..4fcbdc70cbc9 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -112,6 +112,7 @@ FIELD_PREP(FFA_MINOR_VERSION_MASK, (minor))) #define FFA_VERSION_1_0 FFA_PACK_VERSION_INFO(1, 0) #define FFA_VERSION_1_1 FFA_PACK_VERSION_INFO(1, 1) +#define FFA_VERSION_1_2 FFA_PACK_VERSION_INFO(1, 2) /** * FF-A specification mentions explicitly about '4K pages'. This should -- cgit v1.2.3 From c10debfe7f028c11f7a501a0f8e937c9be9e5327 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 17 Feb 2025 15:38:57 +0000 Subject: firmware: arm_ffa: Add support for {un,}registration of framework notifications Framework notifications are doorbells that are rung by the partition managers to signal common events to an endpoint. These doorbells cannot be rung by an endpoint directly. A partition manager can signal a Framework notification in response to an FF-A ABI invocation by an endpoint. Two additional notify_ops interfaces are added so that any FF-A device/driver can register and unregister for such framework notifications.
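As an illustrative sketch, a client driver could register a callback roughly like this (my_fwk_cb, my_data and notify_id are hypothetical):

	static void my_fwk_cb(int notify_id, void *cb_data, void *buf)
	{
		/* buf carries the framework notification payload */
	}

	ret = ffa_dev->ops->notifier_ops->fwk_notify_request(ffa_dev, my_fwk_cb,
							     my_data, notify_id);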
Tested-by: Viresh Kumar Message-Id: <20250217-ffa_updates-v3-16-bd1d9de615e7@arm.com> Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 4fcbdc70cbc9..5bded24dc24f 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -468,6 +468,7 @@ struct ffa_cpu_ops { typedef void (*ffa_sched_recv_cb)(u16 vcpu, bool is_per_vcpu, void *cb_data); typedef void (*ffa_notifier_cb)(int notify_id, void *cb_data); +typedef void (*ffa_fwk_notifier_cb)(int notify_id, void *cb_data, void *buf); struct ffa_notifier_ops { int (*sched_recv_cb_register)(struct ffa_device *dev, @@ -476,6 +477,10 @@ struct ffa_notifier_ops { int (*notify_request)(struct ffa_device *dev, bool per_vcpu, ffa_notifier_cb cb, void *cb_data, int notify_id); int (*notify_relinquish)(struct ffa_device *dev, int notify_id); + int (*fwk_notify_request)(struct ffa_device *dev, + ffa_fwk_notifier_cb cb, void *cb_data, + int notify_id); + int (*fwk_notify_relinquish)(struct ffa_device *dev, int notify_id); int (*notify_send)(struct ffa_device *dev, int notify_id, bool per_vcpu, u16 vcpu); }; -- cgit v1.2.3 From 38e480d4fcac2e191e2f24f381aa8957865c49b2 Mon Sep 17 00:00:00 2001 From: Beata Michalska Date: Fri, 31 Jan 2025 16:24:36 +0000 Subject: cpufreq: Allow arch_freq_get_on_cpu to return an error Allow arch_freq_get_on_cpu() to return an error for cases when retrieving the current CPU frequency is not possible, whether due to lack of the required arch support or to other circumstances in which the current frequency cannot be determined at a given point in time. Signed-off-by: Beata Michalska Reviewed-by: Prasanna Kumar T S M Acked-by: Viresh Kumar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20250131162439.3843071-2-beata.michalska@arm.com Signed-off-by: Catalin Marinas --- include/linux/cpufreq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 7fe0981a7e46..02fd4746231d 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -1184,7 +1184,7 @@ static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_ } #endif -extern unsigned int arch_freq_get_on_cpu(int cpu); +extern int arch_freq_get_on_cpu(int cpu); #ifndef arch_set_freq_scale static __always_inline -- cgit v1.2.3 From 961ee5aeea048aa292f28d61f3a96a48554e91af Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Fri, 14 Feb 2025 15:14:10 +0100 Subject: net: phy: Add helper for getting tx amplitude gain Add a helper which returns the tx amplitude gain defined in the device tree. Modifying it can be necessary to compensate for losses on the PCB and connector, so that the voltages measured on the RJ45 pins conform.
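For illustration, a PHY driver might query and apply the gain roughly as follows (my_phy_apply_tx_gain() is hypothetical; error handling trimmed):

	u32 gain;
	int ret;

	ret = phy_get_tx_amplitude_gain(phydev, &phydev->mdio.dev,
					ETHTOOL_LINK_MODE_100baseT_Full_BIT,
					&gain);
	if (!ret)
		my_phy_apply_tx_gain(phydev, gain);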
Signed-off-by: Dimitri Fedrau Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250214-dp83822-tx-swing-v5-2-02ca72620599@liebherr.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 3665cdd610a3..70c632799082 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2097,6 +2097,10 @@ void phy_get_pause(struct phy_device *phydev, bool *tx_pause, bool *rx_pause); s32 phy_get_internal_delay(struct phy_device *phydev, struct device *dev, const int *delay_values, int size, bool is_rx); +int phy_get_tx_amplitude_gain(struct phy_device *phydev, struct device *dev, + enum ethtool_link_mode_bit_indices linkmode, + u32 *val); + void phy_resolve_pause(unsigned long *local_adv, unsigned long *partner_adv, bool *tx_pause, bool *rx_pause); -- cgit v1.2.3 From 432051806f614ca512da401b80257b95b2a2241e Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 17 Feb 2025 11:06:36 -0800 Subject: bpf: Make every prog keep a copy of ctx_arg_info Currently, ctx_arg_info is read-only in the view of the verifier since it is shared among programs of the same attach type. Make each program keep its own copy of ctx_arg_info so that we can use it to store program-specific information. In the next patch where we support acquiring a referenced kptr through a struct_ops argument tagged with "__ref", ctx_arg_info->ref_obj_id will be used to store the unique reference object id of the argument. This avoids creating a requirement in the verifier that "__ref" tagged arguments must be the first set of references acquired [0]. [0] https://lore.kernel.org/bpf/20241220195619.2022866-2-amery.hung@gmail.com/ Signed-off-by: Amery Hung Acked-by: Eduard Zingerman Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20250217190640.1748177-2-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f3f50e29d639..f4df39e8c735 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1507,7 +1507,7 @@ struct bpf_prog_aux { u32 max_rdonly_access; u32 max_rdwr_access; struct btf *attach_btf; - const struct bpf_ctx_arg_aux *ctx_arg_info; + struct bpf_ctx_arg_aux *ctx_arg_info; void __percpu *priv_stack_ptr; struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ struct bpf_prog *dst_prog; @@ -1945,6 +1945,9 @@ static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_op #endif +int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, + const struct bpf_ctx_arg_aux *info, u32 cnt); + #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, int cgroup_atype); @@ -2546,7 +2549,7 @@ struct bpf_iter__bpf_map_elem { int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); -bool bpf_iter_prog_supported(struct bpf_prog *prog); +int bpf_iter_prog_supported(struct bpf_prog *prog); const struct bpf_func_proto * bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog); -- cgit v1.2.3 From a687df2008f66669deec27f74d1793c8a537a4bd Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 17 Feb 2025 11:06:37 -0800 Subject: bpf: Support getting
referenced kptr from struct_ops argument Allows struct_ops programs to acquire referenced kptrs from arguments by directly reading the argument. The verifier will acquire a reference for a struct_ops argument tagged with "__ref" in the stub function at the beginning of the main program. The user will be able to access the referenced kptr directly by reading the context as long as it has not been released by the program. This new mechanism to acquire a referenced kptr (compared to the existing "kfunc with KF_ACQUIRE") is introduced for ergonomic and semantic reasons. In the first use case, Qdisc_ops, an skb is passed to .enqueue in the first argument. This mechanism provides a natural way for users to get a referenced kptr in the .enqueue struct_ops programs and makes sure that a qdisc will always enqueue or drop the skb. Signed-off-by: Amery Hung Acked-by: Eduard Zingerman Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20250217190640.1748177-3-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f4df39e8c735..15164787ce7f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -968,6 +968,7 @@ struct bpf_insn_access_aux { struct { struct btf *btf; u32 btf_id; + u32 ref_obj_id; }; }; struct bpf_verifier_log *log; /* for verbose logs */ @@ -1481,6 +1482,8 @@ struct bpf_ctx_arg_aux { enum bpf_reg_type reg_type; struct btf *btf; u32 btf_id; + u32 ref_obj_id; + bool refcounted; }; struct btf_mod_pair { -- cgit v1.2.3 From 95b0916118106054e1f3d5d7f8628ef3dc0b3c02 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Thu, 23 Jan 2025 14:07:44 -0500 Subject: percpu: Remove PER_CPU_FIRST_SECTION x86-64 was the last user. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250123190747.745588-13-brgerst@gmail.com --- include/linux/percpu-defs.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 5b520fe86b60..40d34e032d5b 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -26,13 +26,11 @@ #define PER_CPU_SHARED_ALIGNED_SECTION "..shared_aligned" #define PER_CPU_ALIGNED_SECTION "..shared_aligned" #endif -#define PER_CPU_FIRST_SECTION "..first" #else #define PER_CPU_SHARED_ALIGNED_SECTION "" #define PER_CPU_ALIGNED_SECTION "..shared_aligned" -#define PER_CPU_FIRST_SECTION "" #endif @@ -114,16 +112,6 @@ #define DEFINE_PER_CPU(type, name) \ DEFINE_PER_CPU_SECTION(type, name, "") -/* - * Declaration/definition used for per-CPU variables that must come first in - * the set of variables. - */ -#define DECLARE_PER_CPU_FIRST(type, name) \ - DECLARE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) - -#define DEFINE_PER_CPU_FIRST(type, name) \ - DEFINE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) - /* * Declaration/definition used for per-CPU variables that must be cacheline * aligned under SMP conditions so that, whilst a particular instance of the -- cgit v1.2.3 From 806e32248e22582715dbbb7664567b81b9d6928a Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 5 Feb 2025 11:43:27 +0100 Subject: can: Switch to use hrtimer_setup() hrtimer_setup() takes the callback function pointer as argument and initializes the timer completely.
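Schematically, the conversion looks like this (hypothetical driver fields, not taken from this patch):

	-	hrtimer_init(&priv->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	-	priv->timer.function = my_timer_fn;
	+	hrtimer_setup(&priv->timer, my_timer_fn, CLOCK_MONOTONIC,
	+		      HRTIMER_MODE_REL);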
Replace hrtimer_init() and the open-coded initialization of hrtimer::function with the new setup mechanism. Most of this patch is generated by Coccinelle, except for the TX hrtimer in bcm_tx_setup(), because this timer is not used and the callback function is never set. For this particular case, set the callback to hrtimer_dummy_timeout(). Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Reviewed-by: Marc Kleine-Budde Link: https://lore.kernel.org/all/a3a6be42c818722ad41758457408a32163bfd9a0.1738746872.git.namcao@linutronix.de --- include/linux/hrtimer.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index f7bfdcf0dda3..acae379541c5 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -223,6 +223,11 @@ static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) } #endif +static inline enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) +{ + return HRTIMER_NORESTART; +} + /* Exported timer functions: */ /* Initialize timers: */ -- cgit v1.2.3 From c305a4e98378903da5322c598381ad1ce643f4b4 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 18 Feb 2025 10:56:24 +0100 Subject: x86: Move sysctls into arch/x86 Move the following sysctl tables into arch/x86/kernel/setup.c: panic_on_{unrecovered_nmi,io_nmi} bootloader_{type,version} io_delay_type unknown_nmi_panic acpi_realmode_flags Variables moved from include/linux/ to arch/x86/include/asm/ because there is no longer a need for them outside arch/x86/kernel: acpi_realmode_flags panic_on_{unrecovered_nmi,io_nmi} Include <asm/nmi.h> in arch/x86/kernel/setup.h in order to bring in panic_on_{io_nmi,unrecovered_nmi}. This is part of a greater effort to move ctl tables into their respective subsystems, which will reduce the merge conflicts in kernel/sysctl.c. Signed-off-by: Joel Granados Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250218-jag-mv_ctltables-v1-8-cd3698ab8d29@kernel.org --- include/linux/acpi.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4e495b29c640..a70e62d69dc7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -330,7 +330,6 @@ static inline bool acpi_sci_irq_valid(void) } extern int sbf_port; -extern unsigned long acpi_realmode_flags; int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity); int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); -- cgit v1.2.3 From dea69f2d1cc8d9ecdc72ba350d10a1e71940ef18 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 17 Feb 2025 11:39:21 +0100 Subject: gpiolib: move all includes to the top of gpio/consumer.h We have several conditional includes depending on !CONFIG_GPIOLIB. This is supposed to reduce compilation time with CONFIG_GPIOLIB=y but in practice there's no difference on modern machines. It makes adding new stubs that depend on more than just GPIOLIB harder, so move them all to the top, deduplicate them and replace asm/ includes with the preferred linux/ alternatives.
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250217103922.151047-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 5cbd4afd7862..0dc49b5fca5c 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -3,7 +3,10 @@ #define __LINUX_GPIO_CONSUMER_H #include +#include #include +#include +#include #include struct acpi_device; @@ -184,11 +187,6 @@ struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev, #else /* CONFIG_GPIOLIB */ -#include -#include - -#include - static inline int gpiod_count(struct device *dev, const char *con_id) { return 0; @@ -609,8 +607,6 @@ int devm_acpi_dev_add_driver_gpios(struct device *dev, #else /* CONFIG_GPIOLIB && CONFIG_ACPI */ -#include - static inline int acpi_dev_add_driver_gpios(struct acpi_device *adev, const struct acpi_gpio_mapping *gpios) { @@ -636,8 +632,6 @@ void gpiod_unexport(struct gpio_desc *desc); #else /* CONFIG_GPIOLIB && CONFIG_GPIO_SYSFS */ -#include - static inline int gpiod_export(struct gpio_desc *desc, bool direction_may_change) { -- cgit v1.2.3 From 63cdf6241ac7edd94cb4cd9a8f1ba2c3c61ed219 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 17 Feb 2025 11:39:22 +0100 Subject: gpiolib: don't build HTE code with CONFIG_HTE disabled Hardware timestamping is only used on tegra186 platforms but we include the code and export the symbols everywhere. Shrink the binary a bit by compiling the relevant functions conditionally. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250217103922.151047-2-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 0dc49b5fca5c..0b2b56199c36 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -114,8 +114,6 @@ int gpiod_get_direction(struct gpio_desc *desc); int gpiod_direction_input(struct gpio_desc *desc); int gpiod_direction_output(struct gpio_desc *desc, int value); int gpiod_direction_output_raw(struct gpio_desc *desc, int value); -int gpiod_enable_hw_timestamp_ns(struct gpio_desc *desc, unsigned long flags); -int gpiod_disable_hw_timestamp_ns(struct gpio_desc *desc, unsigned long flags); /* Value get/set from non-sleeping context */ int gpiod_get_value(const struct gpio_desc *desc); @@ -347,18 +345,6 @@ static inline int gpiod_direction_output_raw(struct gpio_desc *desc, int value) WARN_ON(desc); return -ENOSYS; } -static inline int gpiod_enable_hw_timestamp_ns(struct gpio_desc *desc, - unsigned long flags) -{ - WARN_ON(desc); - return -ENOSYS; -} -static inline int gpiod_disable_hw_timestamp_ns(struct gpio_desc *desc, - unsigned long flags) -{ - WARN_ON(desc); - return -ENOSYS; -} static inline int gpiod_get_value(const struct gpio_desc *desc) { /* GPIO can never have been requested */ @@ -559,6 +545,28 @@ struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev, #endif /* CONFIG_GPIOLIB */ +#if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_HTE) +int gpiod_enable_hw_timestamp_ns(struct gpio_desc *desc, unsigned long flags); +int gpiod_disable_hw_timestamp_ns(struct gpio_desc *desc, unsigned long flags); +#else +static inline int gpiod_enable_hw_timestamp_ns(struct 
gpio_desc *desc, + unsigned long flags) +{ + if (!IS_ENABLED(CONFIG_GPIOLIB)) + WARN_ON(desc); + + return -ENOSYS; +} +static inline int gpiod_disable_hw_timestamp_ns(struct gpio_desc *desc, + unsigned long flags) +{ + if (!IS_ENABLED(CONFIG_GPIOLIB)) + WARN_ON(desc); + + return -ENOSYS; +} +#endif /* CONFIG_GPIOLIB && CONFIG_HTE */ + static inline struct gpio_desc *devm_fwnode_gpiod_get(struct device *dev, struct fwnode_handle *fwnode, -- cgit v1.2.3 From f54af4af7bd282c0ca9eef3f3bc239ae2fd9a58a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 3 Feb 2025 14:19:19 +0200 Subject: bitmap: Align documentation between bitmap_gather() and bitmap_scatter() The bitmap_scatter() kernel-doc mistakenly refers to itself for the detailed explanation of the relationship between the two helpers. Instead of simply fixing this, align the text in both, making a cross-reference. Fixes: de5f84338970 ("lib/bitmap: Introduce bitmap_scatter() and bitmap_gather() helpers") Signed-off-by: Andy Shevchenko Signed-off-by: Yury Norov --- include/linux/bitmap.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 2026953e2c4e..595217b7a6e7 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -560,9 +560,9 @@ void bitmap_replace(unsigned long *dst, * ...0..11...0..10 * dst: 0000001100000010 * - * A relationship exists between bitmap_scatter() and bitmap_gather(). + * A relationship exists between bitmap_scatter() and bitmap_gather(). See + * bitmap_gather() for the bitmap gather detailed operations. TL;DR: * bitmap_gather() can be seen as the 'reverse' bitmap_scatter() operation. - * See bitmap_scatter() for details related to this relationship. */ static __always_inline void bitmap_scatter(unsigned long *dst, const unsigned long *src, @@ -608,7 +608,9 @@ void bitmap_scatter(unsigned long *dst, const unsigned long *src, * dst: 0000000000011010 * * A relationship exists between bitmap_gather() and bitmap_scatter(). See - * bitmap_scatter() for the bitmap scatter detailed operations. + * bitmap_scatter() for the bitmap scatter detailed operations. TL;DR: + * bitmap_scatter() can be seen as the 'reverse' bitmap_gather() operation. + * * Suppose scattered computed using bitmap_scatter(scattered, src, mask, n). * The operation bitmap_gather(result, scattered, mask, n) leads to a result * equal or equivalent to src. -- cgit v1.2.3 From 9ffa4b35a62d9786ce418085f36af671df3aacaa Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 10 Feb 2025 21:51:06 -0500 Subject: cpumask: add for_each_{possible,online}_cpu_wrap The iterators are trivial extensions of for_each_cpu_wrap(). They are used in the following patches of the series to replace cpumask_next_wrap().
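For example, an illustrative walk over the online CPUs beginning at the local one (do_something() is hypothetical):

	unsigned int cpu, start = raw_smp_processor_id();

	for_each_online_cpu_wrap(cpu, start)
		do_something(cpu);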
Signed-off-by: Yury Norov --- include/linux/cpumask.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 36a890d0dd57..e57fd41ca38e 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1033,11 +1033,21 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define for_each_possible_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_online_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_present_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) + +#define for_each_possible_cpu_wrap(cpu, start) \ + for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) +#define for_each_online_cpu_wrap(cpu, start) \ + for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) #else #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_enabled_cpu(cpu) for_each_cpu((cpu), cpu_enabled_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) + +#define for_each_possible_cpu_wrap(cpu, start) \ + for_each_cpu_wrap((cpu), cpu_possible_mask, (start)) +#define for_each_online_cpu_wrap(cpu, start) \ + for_each_cpu_wrap((cpu), cpu_online_mask, (start)) #endif /* Wrappers for arch boot code to manipulate normally-constant masks */ -- cgit v1.2.3 From d81603b32cde95e5262c84004081b2e3cb4f23ed Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 28 Jan 2025 11:46:30 -0500 Subject: objpool: rework objpool_pop() The function has to track the number of iterations to prevent an infinite loop. The for_each_cpu_wrap() macro takes care of it, which simplifies user code. Signed-off-by: Yury Norov --- include/linux/objpool.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/objpool.h b/include/linux/objpool.h index cb1758eaa2d3..b713a1fe7521 100644 --- a/include/linux/objpool.h +++ b/include/linux/objpool.h @@ -170,17 +170,16 @@ static inline void *objpool_pop(struct objpool_head *pool) { void *obj = NULL; unsigned long flags; - int i, cpu; + int start, cpu; /* disable local irq to avoid preemption & interruption */ raw_local_irq_save(flags); - cpu = raw_smp_processor_id(); - for (i = 0; i < pool->nr_possible_cpus; i++) { + start = raw_smp_processor_id(); + for_each_possible_cpu_wrap(cpu, start) { obj = __objpool_try_get_slot(pool, cpu); if (obj) break; - cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1); } raw_local_irq_restore(flags); -- cgit v1.2.3 From bf8b4e49777d944f84cf7d47360fe80dd3f69d96 Mon Sep 17 00:00:00 2001 From: Asahi Lina Date: Sun, 2 Feb 2025 22:48:47 +0900 Subject: soc: apple: rtkit: Pass the crashlog to the crashed() callback Client drivers might want a copy of the crashlog to stash into a devcoredump blob. Since device memory management can be very variable, the actual devcoredump implementation is left to client drivers. Pass the raw crashlog buffer to the client callback so it can use it if desired.
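As a sketch (assuming the client keeps its struct device in the cookie; struct my_soc is hypothetical), a crashed() callback could stash the log like this:

	static void my_crashed(void *cookie, const void *crashlog,
			       size_t crashlog_size)
	{
		struct my_soc *soc = cookie;
		void *dump = vmalloc(crashlog_size);

		if (!dump)
			return;

		memcpy(dump, crashlog, crashlog_size);
		/* dev_coredumpv() takes ownership and vfree()s the buffer */
		dev_coredumpv(soc->dev, dump, crashlog_size, GFP_KERNEL);
	}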
Signed-off-by: Asahi Lina Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20250202-rtkit-crashdump-v1-1-9d38615b4e12@asahilina.net Signed-off-by: Sven Peter --- include/linux/soc/apple/rtkit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/apple/rtkit.h b/include/linux/soc/apple/rtkit.h index c06d17599ae7..736f53018017 100644 --- a/include/linux/soc/apple/rtkit.h +++ b/include/linux/soc/apple/rtkit.h @@ -56,7 +56,7 @@ struct apple_rtkit_shmem { * context. */ struct apple_rtkit_ops { - void (*crashed)(void *cookie); + void (*crashed)(void *cookie, const void *crashlog, size_t crashlog_size); void (*recv_message)(void *cookie, u8 endpoint, u64 message); bool (*recv_message_early)(void *cookie, u8 endpoint, u64 message); int (*shmem_setup)(void *cookie, struct apple_rtkit_shmem *bfr); -- cgit v1.2.3 From bb519cf6113473c09d571e555137a36d7e2c8566 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 12 Feb 2025 14:03:08 -0500 Subject: ACPI: platform_profile: Improve platform_profile_unregister() Drivers usually call this method on error/exit paths and do not check its return value, which is always 0 anyway, so make it void. This is safe to do as currently all drivers use devm_platform_profile_register(). While at it, improve the style and make the function safer by checking for IS_ERR_OR_NULL before dereferencing the device pointer. Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Link: https://patch.msgid.link/20250212190308.21209-1-kuurtb@gmail.com [ rjw: Minor changelog edits ] Signed-off-by: Rafael J. Wysocki --- include/linux/platform_profile.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 8ab5b0e8eb2c..d5499eca9e1d 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -47,7 +47,7 @@ struct platform_profile_ops { struct device *platform_profile_register(struct device *dev, const char *name, void *drvdata, const struct platform_profile_ops *ops); -int platform_profile_remove(struct device *dev); +void platform_profile_remove(struct device *dev); struct device *devm_platform_profile_register(struct device *dev, const char *name, void *drvdata, const struct platform_profile_ops *ops); -- cgit v1.2.3 From 258e231dc29fbd72bc82c16859a8304f71780ba2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 17 Feb 2025 21:03:01 +0100 Subject: PM: Rearrange documentation related to __pm_runtime_disable() There are only two callers of __pm_runtime_disable(), one of which is device_suspend_late() and the other is pm_runtime_disable(), which has its own kerneldoc comment, and there are no plans to add any more of them. Since they use different values of the __pm_runtime_disable() second parameter, the actual code behavior is different in each case, but it is all documented in the __pm_runtime_disable() kerneldoc comment which is not particularly straightforward. For this reason, move the information from the __pm_runtime_disable() kerneldoc comment to the pm_runtime_disable() one and into a separate comment in device_suspend_late() and remove the __pm_runtime_disable() kerneldoc comment altogether. No functional impact. Signed-off-by: Rafael J.
Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/12617588.O9o76ZdvQC@rjwysocki.net --- include/linux/pm_runtime.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index d39dc863f612..72c62e1171ca 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -556,11 +556,18 @@ static inline int pm_runtime_set_suspended(struct device *dev) * pm_runtime_disable - Disable runtime PM for a device. * @dev: Target device. * - * Prevent the runtime PM framework from working with @dev (by incrementing its - * "blocking" counter). - * - * For each invocation of this function for @dev there must be a matching - * pm_runtime_enable() call in order for runtime PM to be enabled for it. + * Prevent the runtime PM framework from working with @dev by incrementing its + * "disable" counter. + * + * If the counter is zero when this function runs and there is a pending runtime + * resume request for @dev, it will be resumed. If the counter is still zero at + * that point, all of the pending runtime PM requests for @dev will be canceled + * and all runtime PM operations in progress involving it will be waited for to + * complete. + * + * For each invocation of this function for @dev, there must be a matching + * pm_runtime_enable() call, so that runtime PM is eventually enabled for it + * again. */ static inline void pm_runtime_disable(struct device *dev) { -- cgit v1.2.3 From 3e5eee147b7b0f5a93f56beffe34e81fdd00fa0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 18 Feb 2025 21:11:42 +0100 Subject: PM: Block enabling of runtime PM during system suspend If device_prepare() runs on a device that has never had runtime PM enabled so far, it may reasonably assume that runtime PM will not be enabled for that device during the system suspend-resume cycle currently in progress, but this has never been guaranteed. To verify this assumption, make device_prepare() arrange for triggering a device warning accompanied by a call trace dump if runtime PM is enabled for such a device after it has returned. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/6131109.lOV4Wx5bFT@rjwysocki.net --- include/linux/pm.h | 1 + include/linux/pm_runtime.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 78855d794342..6ca6f34c58c3 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -597,6 +597,7 @@ enum rpm_status { RPM_RESUMING, RPM_SUSPENDED, RPM_SUSPENDING, + RPM_BLOCKED, }; /* diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 72c62e1171ca..10769119867b 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -77,6 +77,8 @@ extern int pm_runtime_get_if_in_use(struct device *dev); extern int pm_schedule_suspend(struct device *dev, unsigned int delay); extern int __pm_runtime_set_status(struct device *dev, unsigned int status); extern int pm_runtime_barrier(struct device *dev); +extern void pm_runtime_block_if_disabled(struct device *dev); +extern void pm_runtime_unblock(struct device *dev); extern void pm_runtime_enable(struct device *dev); extern void __pm_runtime_disable(struct device *dev, bool check_resume); extern void pm_runtime_allow(struct device *dev); @@ -271,6 +273,8 @@ static inline int pm_runtime_get_if_active(struct device *dev) static inline int __pm_runtime_set_status(struct device *dev, unsigned int status) { return 0; } static inline int pm_runtime_barrier(struct device *dev) { return 0; } +static inline void pm_runtime_block_if_disabled(struct device *dev) {} +static inline void pm_runtime_unblock(struct device *dev) {} static inline void pm_runtime_enable(struct device *dev) {} static inline void __pm_runtime_disable(struct device *dev, bool c) {} static inline void pm_runtime_allow(struct device *dev) {} -- cgit v1.2.3 From 758cc55ce3d5d79e8f98adbd03ad2cd29133af33 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 18 Feb 2025 21:13:09 +0100 Subject: PM: runtime: Introduce pm_runtime_blocked() Introduce a new helper function called pm_runtime_blocked() for checking the power.last_status value indicating whether or not enabling runtime PM for the given device has been blocked (which happens in the "prepare" phase of system-wide suspend if runtime PM is disabled for the given device at that point). Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/4632087.LvFx2qVVIh@rjwysocki.net --- include/linux/pm_runtime.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 10769119867b..aea0395c10a1 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -81,6 +81,7 @@ extern void pm_runtime_block_if_disabled(struct device *dev); extern void pm_runtime_unblock(struct device *dev); extern void pm_runtime_enable(struct device *dev); extern void __pm_runtime_disable(struct device *dev, bool check_resume); +extern bool pm_runtime_blocked(struct device *dev); extern void pm_runtime_allow(struct device *dev); extern void pm_runtime_forbid(struct device *dev); extern void pm_runtime_no_callbacks(struct device *dev); @@ -277,6 +278,7 @@ static inline void pm_runtime_block_if_disabled(struct device *dev) {} static inline void pm_runtime_unblock(struct device *dev) {} static inline void pm_runtime_enable(struct device *dev) {} static inline void __pm_runtime_disable(struct device *dev, bool c) {} +static inline bool pm_runtime_blocked(struct device *dev) { return true; } static inline void pm_runtime_allow(struct device *dev) {} static inline void pm_runtime_forbid(struct device *dev) {} -- cgit v1.2.3 From 8a6a77bb5a41d5a91c6155e1b902d9f75b5bf9a6 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 16 Feb 2025 22:15:42 +0100 Subject: net: phy: move definition of phy_is_started before phy_disable_eee_mode In preparation of a follow-up patch, move phy_is_started() to before phy_disable_eee_mode(). Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/04d1e7a5-f4c0-42ab-8fa4-88ad26b74813@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 70c632799082..96ea56de71b2 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1340,22 +1340,22 @@ void of_set_phy_timing_role(struct phy_device *phydev); int phy_speed_down_core(struct phy_device *phydev); /** - * phy_disable_eee_mode - Don't advertise an EEE mode. + * phy_is_started - Convenience function to check whether PHY is started * @phydev: The phy_device struct - * @link_mode: The EEE mode to be disabled */ -static inline void phy_disable_eee_mode(struct phy_device *phydev, u32 link_mode) +static inline bool phy_is_started(struct phy_device *phydev) { - linkmode_set_bit(link_mode, phydev->eee_disabled_modes); + return phydev->state >= PHY_UP; } /** - * phy_is_started - Convenience function to check whether PHY is started + * phy_disable_eee_mode - Don't advertise an EEE mode. * @phydev: The phy_device struct + * @link_mode: The EEE mode to be disabled */ -static inline bool phy_is_started(struct phy_device *phydev) +static inline void phy_disable_eee_mode(struct phy_device *phydev, u32 link_mode) { - return phydev->state >= PHY_UP; + linkmode_set_bit(link_mode, phydev->eee_disabled_modes); } void phy_resolve_aneg_pause(struct phy_device *phydev); -- cgit v1.2.3 From a9b6a860d7789d8183530aedbb46cf70f843e40d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 16 Feb 2025 22:16:34 +0100 Subject: net: phy: improve phy_disable_eee_mode If a mode is to be disabled, remove it from advertising_eee. Disabling EEE modes shall be done before calling phy_start(), warn if that's not the case. 
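In driver terms, the expected ordering becomes the following (a sketch; the chosen link mode is arbitrary):

/* MAC setup path: drop 1000BASE-T EEE before the PHY is started */
phy_disable_eee_mode(phydev, ETHTOOL_LINK_MODE_1000baseT_Full_BIT);
phy_start(phydev);	/* reversing these two lines now trips the new WARN_ON() */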
Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/92164896-38ff-4474-b98b-e83fc05b9509@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 96ea56de71b2..0d5da01d275c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1355,7 +1355,10 @@ static inline bool phy_is_started(struct phy_device *phydev) */ static inline void phy_disable_eee_mode(struct phy_device *phydev, u32 link_mode) { + WARN_ON(phy_is_started(phydev)); + linkmode_set_bit(link_mode, phydev->eee_disabled_modes); + linkmode_clear_bit(link_mode, phydev->advertising_eee); } void phy_resolve_aneg_pause(struct phy_device *phydev); -- cgit v1.2.3 From 809265fe96fe3eb7a85a9260356767587c482cb7 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 16 Feb 2025 22:20:07 +0100 Subject: net: phy: c45: remove local advertisement parameter from genphy_c45_eee_is_active After the last user has gone, we can remove the local advertisement parameter from genphy_c45_eee_is_active. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/bd121330-9e28-4bc8-8422-794bd54d561f@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 0d5da01d275c..584710e084eb 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2032,8 +2032,7 @@ int genphy_c45_plca_set_cfg(struct phy_device *phydev, const struct phy_plca_cfg *plca_cfg); int genphy_c45_plca_get_status(struct phy_device *phydev, struct phy_plca_status *plca_st); -int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *adv, - unsigned long *lp); +int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *lp); int genphy_c45_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data); int genphy_c45_ethtool_set_eee(struct phy_device *phydev, -- cgit v1.2.3 From 5564ee3abb2ebece37000662a52eb6607b9c9f7d Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Sat, 15 Feb 2025 03:03:59 -0800 Subject: bpf: use list_head to track explored states and free list The next patch in the set needs the ability to remove individual states from env->free_list while only holding a pointer to the state. Which requires env->free_list to be a doubly linked list. This patch converts env->free_list and struct bpf_verifier_state_list to use struct list_head for this purpose. The change to env->explored_states is collateral. 
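For orientation, a sketch of the resulting iteration pattern (free_it() is a hypothetical predicate standing in for the verifier's real checks):

struct bpf_verifier_state_list *sl, *tmp;

list_for_each_entry_safe(sl, tmp, &env->free_list, node) {
	if (free_it(sl)) {		/* hypothetical predicate */
		list_del(&sl->node);
		kfree(sl);
	}
}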
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250215110411.3236773-9-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 32c23f2a3086..222f6af278ec 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -498,7 +498,7 @@ struct bpf_verifier_state { /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { struct bpf_verifier_state state; - struct bpf_verifier_state_list *next; + struct list_head node; int miss_cnt, hit_cnt; }; @@ -710,8 +710,11 @@ struct bpf_verifier_env { bool test_state_freq; /* test verifier with different pruning frequency */ bool test_reg_invariants; /* fail verification on register invariants violations */ struct bpf_verifier_state *cur_state; /* current verifier state */ - struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ - struct bpf_verifier_state_list *free_list; + /* Search pruning optimization, array of list_heads for + * lists of struct bpf_verifier_state_list. + */ + struct list_head *explored_states; + struct list_head free_list; /* list of struct bpf_verifier_state_list */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */ u32 used_map_cnt; /* number of used maps */ -- cgit v1.2.3 From 408fcf946b2badb0a81c69ab837b6dfc0e76aa87 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Sat, 15 Feb 2025 03:04:00 -0800 Subject: bpf: free verifier states when they are no longer referenced When fixes from patches 1 and 3 are applied, Patrick Somaru reported an increase in memory consumption for sched_ext iterator-based programs hitting 1M instructions limit. For example, 2Gb VMs ran out of memory while verifying a program. Similar behaviour could be reproduced on current bpf-next master. Here is an example of such program: /* verification completes if given 16G or RAM, * final env->free_list size is 369,960 entries. */ SEC("raw_tp") __flag(BPF_F_TEST_STATE_FREQ) __success int free_list_bomb(const void *ctx) { volatile char buf[48] = {}; unsigned i, j; j = 0; bpf_for(i, 0, 10) { /* this forks verifier state: * - verification of current path continues and * creates a checkpoint after 'if'; * - verification of forked path hits the * checkpoint and marks it as loop_entry. */ if (bpf_get_prandom_u32()) asm volatile (""); /* this marks 'j' as precise, thus any checkpoint * created on current iteration would not be matched * on the next iteration. */ buf[j++] = 42; j %= ARRAY_SIZE(buf); } asm volatile (""::"r"(buf)); return 0; } Memory consumption increased due to more states being marked as loop entries and eventually added to env->free_list. This commit introduces logic to free states from env->free_list during verification. A state in env->free_list can be freed if: - it has no child states; - it is not used as a loop_entry. This commit: - updates bpf_verifier_state->used_as_loop_entry to be a counter that tracks how many states use this one as a loop entry; - adds a function maybe_free_verifier_state(), which: - frees a state if its ->branches and ->used_as_loop_entry counters are both zero; - if the state is freed, state->loop_entry->used_as_loop_entry is decremented, and an attempt is made to free state->loop_entry. 
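A minimal sketch of the described freeing logic (assumed shapes only; the upstream function differs in detail):

static void maybe_free_verifier_state(struct bpf_verifier_state_list *sl)
{
	while (sl && !sl->state.branches && !sl->state.used_as_loop_entry) {
		struct bpf_verifier_state *loop_entry = sl->state.loop_entry;

		list_del(&sl->node);
		free_verifier_state(&sl->state, false);
		kfree(sl);

		if (!loop_entry)
			break;

		/* the freed state no longer pins its loop entry */
		loop_entry->used_as_loop_entry--;
		sl = container_of(loop_entry, struct bpf_verifier_state_list, state);
	}
}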
In the example above, this approach reduces the maximum number of states in the free list from 369,960 to 16,223. However, this approach has its limitations. If the buf size in the example above is modified to 64, state caching overflows: the state for j=0 is evicted from the cache before it can be used to stop traversal. As a result, states in the free list accumulate because their branch counters do not reach zero. The effect of this patch on the selftests looks as follows: File Program Max free list (A) Max free list (B) Max free list (DIFF) -------------------------------- ------------------------------------ ----------------- ----------------- -------------------- arena_list.bpf.o arena_list_add 17 3 -14 (-82.35%) bpf_iter_task_stack.bpf.o dump_task_stack 39 9 -30 (-76.92%) iters.bpf.o checkpoint_states_deletion 265 89 -176 (-66.42%) iters.bpf.o clean_live_states 19 0 -19 (-100.00%) profiler2.bpf.o tracepoint__syscalls__sys_enter_kill 102 1 -101 (-99.02%) profiler3.bpf.o tracepoint__syscalls__sys_enter_kill 144 0 -144 (-100.00%) pyperf600_iter.bpf.o on_event 15 0 -15 (-100.00%) pyperf600_nounroll.bpf.o on_event 1170 1158 -12 (-1.03%) setget_sockopt.bpf.o skops_sockopt 18 0 -18 (-100.00%) strobemeta_nounroll1.bpf.o on_event 147 83 -64 (-43.54%) strobemeta_nounroll2.bpf.o on_event 312 209 -103 (-33.01%) strobemeta_subprogs.bpf.o on_event 124 86 -38 (-30.65%) test_cls_redirect_subprogs.bpf.o cls_redirect 15 0 -15 (-100.00%) timer.bpf.o test1 30 15 -15 (-50.00%) Measured using "do-not-submit" patches from here: https://github.com/eddyz87/bpf/tree/get-loop-entry-hungup Reported-by: Patrick Somaru Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250215110411.3236773-10-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 222f6af278ec..f920af30eb06 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -427,11 +427,6 @@ struct bpf_verifier_state { bool active_rcu_lock; bool speculative; - /* If this state was ever pointed-to by other state's loop_entry field - * this flag would be set to true. Used to avoid freeing such states - * while they are still in use. - */ - bool used_as_loop_entry; bool in_sleepable; /* first and last insn idx of this verifier state */ @@ -458,6 +453,11 @@ struct bpf_verifier_state { u32 dfs_depth; u32 callback_unroll_depth; u32 may_goto_depth; + /* If this state was ever pointed-to by other state's loop_entry field + * this flag would be set to true. Used to avoid freeing such states + * while they are still in use. + */ + u32 used_as_loop_entry; }; #define bpf_get_spilled_reg(slot, frame, mask) \ @@ -499,7 +499,9 @@ struct bpf_verifier_state { struct bpf_verifier_state_list { struct bpf_verifier_state state; struct list_head node; - int miss_cnt, hit_cnt; + u32 miss_cnt; + u32 hit_cnt:31; + u32 in_free_list:1; }; struct bpf_loop_inline_state { -- cgit v1.2.3 From 574078b001cdf6dfa4cf8a2f7373a9babdcc26c7 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Sat, 15 Feb 2025 03:04:01 -0800 Subject: bpf: fix env->peak_states computation Compute env->peak_states as a maximum value of sum of env->explored_states and env->free_list size. 
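In other words, a sketch of the intended bookkeeping, using the two size counters added below:

static void update_peak_states(struct bpf_verifier_env *env)
{
	u32 cur = env->explored_states_size + env->free_list_size;

	env->peak_states = max(env->peak_states, cur);
}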
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250215110411.3236773-11-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f920af30eb06..bbd013c38ff9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -772,6 +772,8 @@ struct bpf_verifier_env { u32 peak_states; /* longest register parentage chain walked for liveness marking */ u32 longest_mark_read_walk; + u32 free_list_size; + u32 explored_states_size; bpfptr_t fd_array; /* bit mask to keep track of whether a register has been accessed -- cgit v1.2.3 From bca84a7b93fdc744d79d94423c2cb905b1832310 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 18 Feb 2025 21:16:48 +0100 Subject: PM: sleep: Use DPM_FLAG_SMART_SUSPEND conditionally A recent discussion has revealed that using DPM_FLAG_SMART_SUSPEND unconditionally is generally problematic because it may lead to situations in which the device's runtime PM information is internally inconsistent or does not reflect its real state [1]. For this reason, change the handling of DPM_FLAG_SMART_SUSPEND so that it is only taken into account if it is consistently set by the drivers of all devices having any PM callbacks throughout dependency graphs in accordance with the following rules: - The "smart suspend" feature is only enabled for devices whose drivers ask for it (that is, set DPM_FLAG_SMART_SUSPEND) and for devices without PM callbacks unless they have never had runtime PM enabled. - The "smart suspend" feature is not enabled for a device if it has not been enabled for the device's parent unless the parent does not take children into account or it has never had runtime PM enabled. - The "smart suspend" feature is not enabled for a device if it has not been enabled for one of the device's suppliers taking runtime PM into account unless that supplier has never had runtime PM enabled. Namely, introduce a new device PM flag called smart_suspend that is only set if the above conditions are met and update all DPM_FLAG_SMART_SUSPEND users to check power.smart_suspend instead of directly checking the latter. At the same time, drop the power.set_active flage introduced recently in commit 3775fc538f53 ("PM: sleep: core: Synchronize runtime PM status of parents and children") because it is now sufficient to check power.smart_suspend along with the dev_pm_skip_resume() return value to decide whether or not pm_runtime_set_active() needs to be called for the device. Link: https://lore.kernel.org/linux-pm/CAPDyKFroyU3YDSfw_Y6k3giVfajg3NQGwNWeteJWqpW29BojhQ@mail.gmail.com/ [1] Fixes: 7585946243d6 ("PM: sleep: core: Restrict power.set_active propagation") Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson Acked-by: Bjorn Helgaas # drivers/pci Link: https://patch.msgid.link/1914558.tdWV9SEqCh@rjwysocki.net --- include/linux/device.h | 9 +++++++++ include/linux/pm.h | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 80a5b3268986..615282365052 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1025,6 +1025,15 @@ static inline bool dev_pm_test_driver_flags(struct device *dev, u32 flags) return !!(dev->power.driver_flags & flags); } +static inline bool dev_pm_smart_suspend(struct device *dev) +{ +#ifdef CONFIG_PM_SLEEP + return dev->power.smart_suspend; +#else + return false; +#endif +} + static inline void device_lock(struct device *dev) { mutex_lock(&dev->mutex); diff --git a/include/linux/pm.h b/include/linux/pm.h index 6ca6f34c58c3..24647108f0ad 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -680,8 +680,8 @@ struct dev_pm_info { bool syscore:1; bool no_pm_callbacks:1; /* Owned by the PM core */ bool async_in_progress:1; /* Owned by the PM core */ + bool smart_suspend:1; /* Owned by the PM core */ bool must_resume:1; /* Owned by the PM core */ - bool set_active:1; /* Owned by the PM core */ bool may_skip_resume:1; /* Set by subsystems */ #else bool should_wakeup:1; -- cgit v1.2.3 From 5bbd6e863b15a85221e49b9bdb2d5d8f0bb91f3d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 1 Feb 2025 15:00:02 -0500 Subject: SUNRPC: Prevent looping due to rpc_signal_task() races If rpc_signal_task() is called while a task is in an rpc_call_done() callback function, and the latter calls rpc_restart_call(), the task can end up looping due to the RPC_TASK_SIGNALLED flag being set without the tk_rpc_status being set. Removing the redundant mechanism for signalling the task fixes the looping behaviour. Reported-by: Li Lingfeng Fixes: 39494194f93b ("SUNRPC: Fix races with rpc_killall_tasks()") Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index fec1e8a1570c..eac57914dcf3 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -158,7 +158,6 @@ enum { RPC_TASK_NEED_XMIT, RPC_TASK_NEED_RECV, RPC_TASK_MSG_PIN_WAIT, - RPC_TASK_SIGNALLED, }; #define rpc_test_and_set_running(t) \ @@ -171,7 +170,7 @@ enum { #define RPC_IS_ACTIVATED(t) test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate) -#define RPC_SIGNALLED(t) test_bit(RPC_TASK_SIGNALLED, &(t)->tk_runstate) +#define RPC_SIGNALLED(t) (READ_ONCE(task->tk_rpc_status) == -ERESTARTSYS) /* * Task priorities. -- cgit v1.2.3 From 903599768a2c39bdd9976d41b4cbc89fbfc55e38 Mon Sep 17 00:00:00 2001 From: "Sicelo A. Mhlongo" Date: Fri, 7 Feb 2025 23:15:13 +0200 Subject: power: supply: bq27xxx: Add voltage_max_design property for bq270x0 and bq27x10 Report VOLTAGE_MAX_DESIGN for the bq27x00 and bq27x10 fuel gauges. Per the datasheet, this value is stored in the Charge Termination Voltage Settings (QV0 and QV1) of the Pack Configuration register. Tested on the Nokia N900 with bq27200. Signed-off-by: Sicelo A. 
Mhlongo Link: https://lore.kernel.org/r/20250207211521.103357-1-absicsz@gmail.com Signed-off-by: Sebastian Reichel --- include/linux/power/bq27xxx_battery.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h index 6b190639b08e..d56e1276aafe 100644 --- a/include/linux/power/bq27xxx_battery.h +++ b/include/linux/power/bq27xxx_battery.h @@ -62,6 +62,7 @@ struct bq27xxx_device_info { struct bq27xxx_reg_cache cache; int charge_design_full; int voltage_min_design; + int voltage_max_design; bool removed; unsigned long last_update; union power_supply_propval last_status; -- cgit v1.2.3 From f0b79944e6f41b1a242b128fb2702378eb48fbd2 Mon Sep 17 00:00:00 2001 From: Jordan Rome Date: Thu, 13 Feb 2025 07:21:23 -0800 Subject: mm: Add copy_remote_vm_str() for readng C strings from remote VM Similar to `access_process_vm()` but specific to strings. Also chunks reads by page and utilizes `strscpy()` for handling null termination. The primary motivation for this change is to copy strings from a non-current task/process in BPF. There is already a helper `bpf_copy_from_user_task()`, which uses `access_process_vm()` but one to handle strings would be very helpful. Signed-off-by: Jordan Rome Signed-off-by: Andrii Nakryiko Reviewed-by: Shakeel Butt Link: https://lore.kernel.org/bpf/20250213152125.1837400-1-linux@jordanrome.com --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b1068ddcbb7..5409c7b50509 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2486,6 +2486,11 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); +#ifdef CONFIG_BPF_SYSCALL +extern int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags); +#endif + long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, -- cgit v1.2.3 From ac9a8587edc78f3a66fdf6a99973ef151cbff72a Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 18 Feb 2025 10:24:39 +0000 Subject: net: stmmac: "speed" passed to fix_mac_speed is an int priv->plat->fix_mac_speed() is called from stmmac_mac_link_up(), which is passed the speed as an "int". However, fix_mac_speed() implicitly casts this to an unsigned int. Some platform glue code print this value using %u, others with %d. Some implicitly cast it back to an int, and others to u32. Good practice is to use one type and only one type to represent a value being passed around a driver. Switch all of these over to consistently use "int" when dealing with a speed passed from stmmac_mac_link_up(), even though the speed will always be positive. 
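A sketch of what platform glue looks like after the change (the glue structure, helper, and clock rates are hypothetical):

static void my_soc_fix_mac_speed(void *bsp_priv, int speed, unsigned int mode)
{
	struct my_glue *glue = bsp_priv;	/* hypothetical glue struct */

	switch (speed) {
	case SPEED_1000:
		my_set_tx_clk(glue, 125000000);	/* hypothetical helper */
		break;
	case SPEED_100:
		my_set_tx_clk(glue, 25000000);
		break;
	default:
		/* 'speed' is an int end to end, so %d is always correct */
		dev_err(glue->dev, "unsupported speed %d\n", speed);
	}
}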
Signed-off-by: Russell King (Oracle) Acked-by: Chen-Yu Tsai Reviewed-by: Andrew Lunn Acked-by: Nobuhiro Iwamatsu Link: https://patch.msgid.link/E1tkKmN-004ObM-Ge@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 24422ac4e417..6d2aa77ea963 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -231,7 +231,7 @@ struct plat_stmmacenet_data { u8 tx_sched_algorithm; struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES]; struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES]; - void (*fix_mac_speed)(void *priv, unsigned int speed, unsigned int mode); + void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); int (*fix_soc_reset)(void *priv, void __iomem *ioaddr); int (*serdes_powerup)(struct net_device *ndev, void *priv); void (*serdes_powerdown)(struct net_device *ndev, void *priv); -- cgit v1.2.3 From ae3a4f1fdc2cfad089e79e2ee4697f84941528d3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Feb 2025 10:22:26 -0700 Subject: eventpoll: add epoll_sendevents() helper Basic helper that copies ready events to the specified userspace address. The event checking is quick and racy, it's up to the caller to ensure it retries appropriately in case 0 events are copied. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20250219172552.1565603-4-axboe@kernel.dk Signed-off-by: Christian Brauner --- include/linux/eventpoll.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 0c0d00fcd131..ccb478eb174b 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -25,6 +25,10 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long t /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); +/* Copy ready events to userspace */ +int epoll_sendevents(struct file *file, struct epoll_event __user *events, + int maxevents); + /* * This is called from inside fs/file_table.c:__fput() to unlink files * from the eventpoll interface. We need to have this facility to cleanup -- cgit v1.2.3 From 8fd74a31eaf3def0d57264ada57fc981902aeadf Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Sat, 8 Feb 2025 22:15:26 +0800 Subject: driver core: class: Remove needless return in void API class_remove_file() Remove return since both class_remove_file() and class_remove_file_ns() are void functions. 
Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250208-cls_rmv_return-v1-1-091b37945aac@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device/class.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device/class.h b/include/linux/device/class.h index 45ee3a634999..65880e60c720 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -193,7 +193,7 @@ static inline int __must_check class_create_file(const struct class *class, static inline void class_remove_file(const struct class *class, const struct class_attribute *attr) { - return class_remove_file_ns(class, attr, NULL); + class_remove_file_ns(class, attr, NULL); } /* Simple class attribute that is just a static string */ -- cgit v1.2.3 From a44073c28bc6d4118891d61e31c9fa9dc4333dc0 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Sat, 8 Feb 2025 23:18:39 +0800 Subject: driver core: Remove needless return in void API device_remove_group() Remove return since both device_remove_group() and device_remove_groups() are void functions. Fixes: e323b2dddc1c ("driver core: add device_{add|remove}_group() helpers") Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250208-fix_device_remove_group-v1-1-8a5b0ac0ce5c@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 80a5b3268986..605b60254f6d 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1268,7 +1268,7 @@ static inline void device_remove_group(struct device *dev, { const struct attribute_group *groups[] = { grp, NULL }; - return device_remove_groups(dev, groups); + device_remove_groups(dev, groups); } int __must_check devm_device_add_group(struct device *dev, -- cgit v1.2.3 From 1c000dcaad2bef20189f3868207f515ef4b637ee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Feb 2025 14:26:48 +0530 Subject: irqchip/irq-msi-lib: Optionally set default irq_eoi()/irq_ack() msi_lib_init_dev_msi_info() sets the default irq_eoi()/irq_ack() callbacks unconditionally. This is correct for all existing users, but prevents the IMSIC driver to be moved to the MSI library implementation. Introduce chip_flags in struct msi_parent_ops, which instruct the library to selectively set the callbacks depending on the flags, and update all current users to set them. Signed-off-by: Thomas Gleixner Signed-off-by: Anup Patel Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250217085657.789309-3-apatel@ventanamicro.com --- include/linux/msi.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index b10093c4d00e..9abef442c146 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -558,11 +558,21 @@ enum { MSI_FLAG_NO_AFFINITY = (1 << 21), }; +/* + * Flags for msi_parent_ops::chip_flags + */ +enum { + MSI_CHIP_FLAG_SET_EOI = (1 << 0), + MSI_CHIP_FLAG_SET_ACK = (1 << 1), +}; + /** * struct msi_parent_ops - MSI parent domain callbacks and configuration info * * @supported_flags: Required: The supported MSI flags of the parent domain * @required_flags: Optional: The required MSI flags of the parent MSI domain + * @chip_flags: Optional: Select MSI chip callbacks to update with defaults + * in msi_lib_init_dev_msi_info(). 
* @bus_select_token: Optional: The bus token of the real parent domain for * irq_domain::select() * @bus_select_mask: Optional: A mask of supported BUS_DOMAINs for @@ -575,6 +585,7 @@ enum { struct msi_parent_ops { u32 supported_flags; u32 required_flags; + u32 chip_flags; u32 bus_select_token; u32 bus_select_mask; const char *prefix; -- cgit v1.2.3 From 751dc837dabd275d0ab165fc737c10f80e2e863a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 17 Feb 2025 14:26:50 +0530 Subject: genirq: Introduce common irq_force_complete_move() implementation CONFIG_GENERIC_PENDING_IRQ requires an architecture-specific implementation of irq_force_complete_move() for CPU hotplug. At the moment, only x86 implements this unconditionally, but for RISC-V irq_force_complete_move() is only needed when the RISC-V IMSIC driver is in use and not needed otherwise. To allow runtime configuration of this mechanism, introduce a common irq_force_complete_move() implementation in the interrupt core code, which only invokes the completion function when an interrupt chip in the hierarchy implements it. Switch X86 over to the new mechanism. No functional change intended. Signed-off-by: Thomas Gleixner Signed-off-by: Anup Patel Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250217085657.789309-5-apatel@ventanamicro.com --- include/linux/irq.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 8daa17f0107a..56f6583093d2 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -486,6 +486,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @ipi_send_mask: send an IPI to destination cpus in cpumask * @irq_nmi_setup: function called from core code before enabling an NMI * @irq_nmi_teardown: function called from core code after disabling an NMI + * @irq_force_complete_move: optional function to force complete pending irq move * @flags: chip specific flags */ struct irq_chip { @@ -537,6 +538,8 @@ struct irq_chip { int (*irq_nmi_setup)(struct irq_data *data); void (*irq_nmi_teardown)(struct irq_data *data); + void (*irq_force_complete_move)(struct irq_data *data); + unsigned long flags; }; @@ -619,11 +622,9 @@ static inline void irq_move_irq(struct irq_data *data) __irq_move_irq(data); } void irq_move_masked_irq(struct irq_data *data); -void irq_force_complete_move(struct irq_desc *desc); #else static inline void irq_move_irq(struct irq_data *data) { } static inline void irq_move_masked_irq(struct irq_data *data) { } -static inline void irq_force_complete_move(struct irq_desc *desc) { } #endif extern int no_irq_affinity; -- cgit v1.2.3 From e54b1b5e89ae765e6d71d41883a8f551fde8d0ab Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 17 Feb 2025 14:26:51 +0530 Subject: genirq: Introduce irq_can_move_in_process_context() Interrupt controller drivers which enable CONFIG_GENERIC_PENDING_IRQ need to know whether an interrupt can be moved in process context in order to decide whether they need to invoke the workaround for non-atomic MSI updates. This information can be retrieved via irq_can_move_pcntxt(). That helper requires access to the top-most interrupt domain data, but the driver which requires this is usually further down in the hierarchy. Introduce irq_can_move_in_process_context() which retrieves that information from the top-most interrupt domain data.
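Driver-side usage then reduces to something like this sketch (both update helpers are hypothetical stand-ins for a driver's MSI message paths):

static void my_msi_set_affinity(struct irq_data *d, const struct cpumask *mask)
{
	if (irq_can_move_in_process_context(d))
		my_write_msi_msg(d, mask);	/* direct update is safe here */
	else
		my_defer_msi_update(d, mask);	/* apply the non-atomic workaround */
}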
[ tglx: Massaged change log ] Signed-off-by: Anup Patel Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250217085657.789309-6-apatel@ventanamicro.com --- include/linux/irq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 56f6583093d2..dd5df1e2d032 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -615,6 +615,7 @@ extern int irq_affinity_online_cpu(unsigned int cpu); #endif #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) +bool irq_can_move_in_process_context(struct irq_data *data); void __irq_move_irq(struct irq_data *data); static inline void irq_move_irq(struct irq_data *data) { @@ -623,6 +624,7 @@ static inline void irq_move_irq(struct irq_data *data) } void irq_move_masked_irq(struct irq_data *data); #else +static inline bool irq_can_move_in_process_context(struct irq_data *data) { return true; } static inline void irq_move_irq(struct irq_data *data) { } static inline void irq_move_masked_irq(struct irq_data *data) { } #endif -- cgit v1.2.3 From 10d43ecbb0121fee8ddbc1e9755edc0402e458ca Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 30 Jan 2025 01:26:54 +0000 Subject: mei: Remove unused functions The following functions have been in the mei code for a long time but have never been used. mei_txe_setup_satt2() was added in 2014 by commit 32e2b59fca2c ("mei: txe: add hw-txe.c") mei_me_cl_rm_by_uuid_id() was added in 2015 by commit 79563db9ddd3 ("mei: add reference counting for me clients") mei_cldev_uuid() was added in 2015 by commit baeacd037697 ("mei: bus: export uuid and protocol version to mei_cl bus drivers") mei_cldev_recv_nonblock() was added in 2016 by commit 076802d00615 ("mei: bus: enable non-blocking RX") it is the only user of mei_cldev_recv_nonblock_vtag(). Remove them. Signed-off-by: Dr. 
David Alan Gilbert Acked-by: Arnd Bergmann Reviewed-by: Alexander Usyskin Link: https://lore.kernel.org/r/20250130012654.255119-1-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mei_cl_bus.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h index b38a56a13f39..725fd7727422 100644 --- a/include/linux/mei_cl_bus.h +++ b/include/linux/mei_cl_bus.h @@ -97,8 +97,6 @@ ssize_t mei_cldev_send(struct mei_cl_device *cldev, const u8 *buf, ssize_t mei_cldev_send_timeout(struct mei_cl_device *cldev, const u8 *buf, size_t length, unsigned long timeout); ssize_t mei_cldev_recv(struct mei_cl_device *cldev, u8 *buf, size_t length); -ssize_t mei_cldev_recv_nonblock(struct mei_cl_device *cldev, u8 *buf, - size_t length); ssize_t mei_cldev_recv_timeout(struct mei_cl_device *cldev, u8 *buf, size_t length, unsigned long timeout); ssize_t mei_cldev_send_vtag(struct mei_cl_device *cldev, const u8 *buf, @@ -107,8 +105,6 @@ ssize_t mei_cldev_send_vtag_timeout(struct mei_cl_device *cldev, const u8 *buf, size_t length, u8 vtag, unsigned long timeout); ssize_t mei_cldev_recv_vtag(struct mei_cl_device *cldev, u8 *buf, size_t length, u8 *vtag); -ssize_t mei_cldev_recv_nonblock_vtag(struct mei_cl_device *cldev, u8 *buf, - size_t length, u8 *vtag); ssize_t mei_cldev_recv_vtag_timeout(struct mei_cl_device *cldev, u8 *buf, size_t length, u8 *vtag, unsigned long timeout); @@ -116,7 +112,6 @@ int mei_cldev_register_rx_cb(struct mei_cl_device *cldev, mei_cldev_cb_t rx_cb); int mei_cldev_register_notif_cb(struct mei_cl_device *cldev, mei_cldev_cb_t notif_cb); -const uuid_le *mei_cldev_uuid(const struct mei_cl_device *cldev); u8 mei_cldev_ver(const struct mei_cl_device *cldev); void *mei_cldev_get_drvdata(const struct mei_cl_device *cldev); -- cgit v1.2.3 From b4c173dfbb6c78568578ff18f9e8822d7bd0e31b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Feb 2025 11:02:58 +0100 Subject: fuse: don't truncate cached, mutated symlink Fuse allows the value of a symlink to change and this property is exploited by some filesystems (e.g. CVMFS). It has been observed, that sometimes after changing the symlink contents, the value is truncated to the old size. This is caused by fuse_getattr() racing with fuse_reverse_inval_inode(). fuse_reverse_inval_inode() updates the fuse_inode's attr_version, which results in fuse_change_attributes() exiting before updating the cached attributes This is okay, as the cached attributes remain invalid and the next call to fuse_change_attributes() will likely update the inode with the correct values. The reason this causes problems is that cached symlinks will be returned through page_get_link(), which truncates the symlink to inode->i_size. This is correct for filesystems that don't mutate symlinks, but in this case it causes bad behavior. The solution is to just remove this truncation. This can cause a regression in a filesystem that relies on supplying a symlink larger than the file size, but this is unlikely. If that happens we'd need to make this behavior conditional. 
Reported-by: Laura Promberger Tested-by: Sam Lewis Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20250220100258.793363-1-mszeredi@redhat.com Reviewed-by: Bernd Schubert Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2c3b2f8a621f..9346adf28f7b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3452,6 +3452,8 @@ extern const struct file_operations generic_ro_fops; extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); +extern const char *page_get_link_raw(struct dentry *, struct inode *, + struct delayed_call *); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); extern void page_put_link(void *); -- cgit v1.2.3 From bfad07fe298bfba0c7ddab87c5b5325970203a1e Mon Sep 17 00:00:00 2001 From: Chris Morgan Date: Tue, 4 Feb 2025 09:58:32 -0600 Subject: mfd: axp20x: AXP717: Add AXP717_TS_PIN_CFG to writeable regs Add AXP717_TS_PIN_CFG (register 0x50) to the table of writeable registers so that the temperature sensor can be configured by the battery driver. Signed-off-by: Chris Morgan Link: https://lore.kernel.org/r/20250204155835.161973-3-macroalpha82@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/axp20x.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index c3df0e615fbf..3c5aecf1d4b5 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -137,6 +137,7 @@ enum axp20x_variants { #define AXP717_IRQ2_STATE 0x4a #define AXP717_IRQ3_STATE 0x4b #define AXP717_IRQ4_STATE 0x4c +#define AXP717_TS_PIN_CFG 0x50 #define AXP717_ICC_CHG_SET 0x62 #define AXP717_ITERM_CHG_SET 0x63 #define AXP717_CV_CHG_SET 0x64 -- cgit v1.2.3 From a6a494c8e3ce1fe84aac538b087a4cab868ed83f Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 23 Jan 2025 18:04:28 +0300 Subject: power: supply: max77705: Add charger driver for Maxim 77705 Add a driver for the Maxim 77705 switch-mode charger, providing power supply class information to userspace. The driver is configured through DTS (battery and system-related settings). Signed-off-by: Dzmitry Sankouski Reviewed-by: Krzysztof Kozlowski Acked-by: Sebastian Reichel Link: https://lore.kernel.org/r/20250123-starqltechn_integration_upstream-v17-3-8b06685b6612@gmail.com Signed-off-by: Lee Jones --- include/linux/power/max77705_charger.h | 195 +++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 include/linux/power/max77705_charger.h (limited to 'include/linux') diff --git a/include/linux/power/max77705_charger.h b/include/linux/power/max77705_charger.h new file mode 100644 index 000000000000..fdec9af9c541 --- /dev/null +++ b/include/linux/power/max77705_charger.h @@ -0,0 +1,195 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Maxim MAX77705 definitions. + * + * Copyright (C) 2015 Samsung Electronics, Inc.
+ * Copyright (C) 2025 Dzmitry Sankouski + */ + +#ifndef __MAX77705_CHARGER_H +#define __MAX77705_CHARGER_H __FILE__ + +/* MAX77705_CHG_REG_CHG_INT */ +#define MAX77705_BYP_I BIT(0) +#define MAX77705_INP_LIMIT_I BIT(1) +#define MAX77705_BATP_I BIT(2) +#define MAX77705_BAT_I BIT(3) +#define MAX77705_CHG_I BIT(4) +#define MAX77705_WCIN_I BIT(5) +#define MAX77705_CHGIN_I BIT(6) +#define MAX77705_AICL_I BIT(7) + +/* MAX77705_CHG_REG_CHG_INT_MASK */ +#define MAX77705_BYP_IM BIT(0) +#define MAX77705_INP_LIMIT_IM BIT(1) +#define MAX77705_BATP_IM BIT(2) +#define MAX77705_BAT_IM BIT(3) +#define MAX77705_CHG_IM BIT(4) +#define MAX77705_WCIN_IM BIT(5) +#define MAX77705_CHGIN_IM BIT(6) +#define MAX77705_AICL_IM BIT(7) + +/* MAX77705_CHG_REG_CHG_INT_OK */ +#define MAX77705_BYP_OK BIT(0) +#define MAX77705_DISQBAT_OK BIT(1) +#define MAX77705_BATP_OK BIT(2) +#define MAX77705_BAT_OK BIT(3) +#define MAX77705_CHG_OK BIT(4) +#define MAX77705_WCIN_OK BIT(5) +#define MAX77705_CHGIN_OK BIT(6) +#define MAX77705_AICL_OK BIT(7) + +/* MAX77705_CHG_REG_DETAILS_00 */ +#define MAX77705_BATP_DTLS BIT(0) +#define MAX77705_WCIN_DTLS GENMASK(4, 3) +#define MAX77705_WCIN_DTLS_SHIFT 3 +#define MAX77705_CHGIN_DTLS GENMASK(6, 5) +#define MAX77705_CHGIN_DTLS_SHIFT 5 + +/* MAX77705_CHG_REG_DETAILS_01 */ +#define MAX77705_CHG_DTLS GENMASK(3, 0) +#define MAX77705_CHG_DTLS_SHIFT 0 +#define MAX77705_BAT_DTLS GENMASK(6, 4) +#define MAX77705_BAT_DTLS_SHIFT 4 + +/* MAX77705_CHG_REG_DETAILS_02 */ +#define MAX77705_BYP_DTLS GENMASK(3, 0) +#define MAX77705_BYP_DTLS_SHIFT 0 + +/* MAX77705_CHG_REG_CNFG_00 */ +#define MAX77705_CHG_SHIFT 0 +#define MAX77705_UNO_SHIFT 1 +#define MAX77705_OTG_SHIFT 1 +#define MAX77705_BUCK_SHIFT 2 +#define MAX77705_BOOST_SHIFT 3 +#define MAX77705_WDTEN_SHIFT 4 +#define MAX77705_MODE_MASK GENMASK(3, 0) +#define MAX77705_CHG_MASK BIT(MAX77705_CHG_SHIFT) +#define MAX77705_UNO_MASK BIT(MAX77705_UNO_SHIFT) +#define MAX77705_OTG_MASK BIT(MAX77705_OTG_SHIFT) +#define MAX77705_BUCK_MASK BIT(MAX77705_BUCK_SHIFT) +#define MAX77705_BOOST_MASK BIT(MAX77705_BOOST_SHIFT) +#define MAX77705_WDTEN_MASK BIT(MAX77705_WDTEN_SHIFT) +#define MAX77705_UNO_CTRL (MAX77705_UNO_MASK | MAX77705_BOOST_MASK) +#define MAX77705_OTG_CTRL (MAX77705_OTG_MASK | MAX77705_BOOST_MASK) + +/* MAX77705_CHG_REG_CNFG_01 */ +#define MAX77705_FCHGTIME_SHIFT 0 +#define MAX77705_FCHGTIME_MASK GENMASK(2, 0) +#define MAX77705_CHG_RSTRT_SHIFT 4 +#define MAX77705_CHG_RSTRT_MASK GENMASK(5, 4) +#define MAX77705_FCHGTIME_DISABLE 0 +#define MAX77705_CHG_RSTRT_DISABLE 0x3 + +#define MAX77705_PQEN_SHIFT 7 +#define MAX77705_PQEN_MASK BIT(7) +#define MAX77705_CHG_PQEN_DISABLE 0 +#define MAX77705_CHG_PQEN_ENABLE 1 + +/* MAX77705_CHG_REG_CNFG_02 */ +#define MAX77705_OTG_ILIM_SHIFT 6 +#define MAX77705_OTG_ILIM_MASK GENMASK(7, 6) +#define MAX77705_OTG_ILIM_500 0 +#define MAX77705_OTG_ILIM_900 1 +#define MAX77705_OTG_ILIM_1200 2 +#define MAX77705_OTG_ILIM_1500 3 +#define MAX77705_CHG_CC GENMASK(5, 0) + +/* MAX77705_CHG_REG_CNFG_03 */ +#define MAX77705_TO_ITH_SHIFT 0 +#define MAX77705_TO_ITH_MASK GENMASK(2, 0) +#define MAX77705_TO_TIME_SHIFT 3 +#define MAX77705_TO_TIME_MASK GENMASK(5, 3) +#define MAX77705_SYS_TRACK_DIS_SHIFT 7 +#define MAX77705_SYS_TRACK_DIS_MASK BIT(7) +#define MAX77705_TO_ITH_150MA 0 +#define MAX77705_TO_TIME_30M 3 +#define MAX77705_SYS_TRACK_ENABLE 0 +#define MAX77705_SYS_TRACK_DISABLE 1 + +/* MAX77705_CHG_REG_CNFG_04 */ +#define MAX77705_CHG_MINVSYS_SHIFT 6 +#define MAX77705_CHG_MINVSYS_MASK GENMASK(7, 6) +#define MAX77705_CHG_PRM_SHIFT 0 +#define 
MAX77705_CHG_PRM_MASK GENMASK(5, 0) + +#define MAX77705_CHG_CV_PRM_SHIFT 0 +#define MAX77705_CHG_CV_PRM_MASK GENMASK(5, 0) + +/* MAX77705_CHG_REG_CNFG_05 */ +#define MAX77705_REG_B2SOVRC_SHIFT 0 +#define MAX77705_REG_B2SOVRC_MASK GENMASK(3, 0) +#define MAX77705_B2SOVRC_DISABLE 0 +#define MAX77705_B2SOVRC_4_5A 6 +#define MAX77705_B2SOVRC_4_8A 8 +#define MAX77705_B2SOVRC_5_0A 9 + +/* MAX77705_CHG_CNFG_06 */ +#define MAX77705_WDTCLR_SHIFT 0 +#define MAX77705_WDTCLR_MASK GENMASK(1, 0) +#define MAX77705_WDTCLR 1 +#define MAX77705_CHGPROT_MASK GENMASK(3, 2) +#define MAX77705_CHGPROT_UNLOCKED GENMASK(3, 2) +#define MAX77705_SLOWEST_LX_SLOPE GENMASK(6, 5) + +/* MAX77705_CHG_REG_CNFG_07 */ +#define MAX77705_CHG_FMBST 4 +#define MAX77705_REG_FMBST_SHIFT 2 +#define MAX77705_REG_FMBST_MASK BIT(MAX77705_REG_FMBST_SHIFT) +#define MAX77705_REG_FGSRC_SHIFT 1 +#define MAX77705_REG_FGSRC_MASK BIT(MAX77705_REG_FGSRC_SHIFT) + +/* MAX77705_CHG_REG_CNFG_08 */ +#define MAX77705_REG_FSW_SHIFT 0 +#define MAX77705_REG_FSW_MASK GENMASK(1, 0) +#define MAX77705_CHG_FSW_3MHz 0 +#define MAX77705_CHG_FSW_2MHz 1 +#define MAX77705_CHG_FSW_1_5MHz 2 + +/* MAX77705_CHG_REG_CNFG_09 */ +#define MAX77705_CHG_CHGIN_LIM_MASK GENMASK(6, 0) +#define MAX77705_CHG_EN_MASK BIT(7) +#define MAX77705_CHG_DISABLE 0 +#define MAX77705_CHARGER_CHG_CHARGING(_reg) \ + (((_reg) & MAX77705_CHG_EN_MASK) > 1) + + +/* MAX77705_CHG_REG_CNFG_10 */ +#define MAX77705_CHG_WCIN_LIM GENMASK(5, 0) + +/* MAX77705_CHG_REG_CNFG_11 */ +#define MAX77705_VBYPSET_SHIFT 0 +#define MAX77705_VBYPSET_MASK GENMASK(6, 0) + +/* MAX77705_CHG_REG_CNFG_12 */ +#define MAX77705_CHGINSEL_SHIFT 5 +#define MAX77705_CHGINSEL_MASK BIT(MAX77705_CHGINSEL_SHIFT) +#define MAX77705_WCINSEL_SHIFT 6 +#define MAX77705_WCINSEL_MASK BIT(MAX77705_WCINSEL_SHIFT) +#define MAX77705_VCHGIN_REG_MASK GENMASK(4, 3) +#define MAX77705_WCIN_REG_MASK GENMASK(2, 1) +#define MAX77705_REG_DISKIP_SHIFT 0 +#define MAX77705_REG_DISKIP_MASK BIT(MAX77705_REG_DISKIP_SHIFT) +/* REG=4.5V, UVLO=4.7V */ +#define MAX77705_VCHGIN_4_5 0 +/* REG=4.5V, UVLO=4.7V */ +#define MAX77705_WCIN_4_5 0 +#define MAX77705_DISABLE_SKIP 1 +#define MAX77705_AUTO_SKIP 0 + +/* uA */ +#define MAX77705_CURRENT_CHGIN_STEP 25000 +#define MAX77705_CURRENT_CHG_STEP 50000 +#define MAX77705_CURRENT_CHGIN_MIN 100000 +#define MAX77705_CURRENT_CHGIN_MAX 3200000 + +struct max77705_charger_data { + struct device *dev; + struct regmap *regmap; + struct power_supply_battery_info *bat_info; + struct workqueue_struct *wqueue; + struct work_struct chgin_work; + struct power_supply *psy_chg; +}; + +#endif /* __MAX77705_CHARGER_H */ -- cgit v1.2.3 From c8d50f029748b73313838b64b829992b66ccb704 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 23 Jan 2025 18:04:30 +0300 Subject: mfd: Add new driver for MAX77705 PMIC Add the core MFD driver for max77705 PMIC. Drivers for sub-devices will be added in subsequent patches. 
Signed-off-by: Dzmitry Sankouski Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250123-starqltechn_integration_upstream-v17-5-8b06685b6612@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/max77693-common.h | 4 +- include/linux/mfd/max77705-private.h | 178 +++++++++++++++++++++++++++++++++++ 2 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 include/linux/mfd/max77705-private.h (limited to 'include/linux') diff --git a/include/linux/mfd/max77693-common.h b/include/linux/mfd/max77693-common.h index a5bce099f1ed..ec2e1b2dceb8 100644 --- a/include/linux/mfd/max77693-common.h +++ b/include/linux/mfd/max77693-common.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * Common data shared between Maxim 77693 and 77843 drivers + * Common data shared between Maxim 77693, 77705 and 77843 drivers * * Copyright (C) 2015 Samsung Electronics */ @@ -11,6 +11,7 @@ enum max77693_types { TYPE_MAX77693_UNKNOWN, TYPE_MAX77693, + TYPE_MAX77705, TYPE_MAX77843, TYPE_MAX77693_NUM, @@ -32,6 +33,7 @@ struct max77693_dev { struct regmap *regmap_muic; struct regmap *regmap_haptic; /* Only MAX77693 */ struct regmap *regmap_chg; /* Only MAX77843 */ + struct regmap *regmap_leds; /* Only MAX77705 */ struct regmap_irq_chip_data *irq_data_led; struct regmap_irq_chip_data *irq_data_topsys; diff --git a/include/linux/mfd/max77705-private.h b/include/linux/mfd/max77705-private.h new file mode 100644 index 000000000000..0c8b07abfb9d --- /dev/null +++ b/include/linux/mfd/max77705-private.h @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Maxim MAX77705 definitions. + * + * Copyright (C) 2015 Samsung Electronics, Inc. + * Copyright (C) 2025 Dzmitry Sankouski + */ + +#ifndef __LINUX_MFD_MAX77705_PRIV_H +#define __LINUX_MFD_MAX77705_PRIV_H + +#define MAX77705_SRC_IRQ_CHG BIT(0) +#define MAX77705_SRC_IRQ_TOP BIT(1) +#define MAX77705_SRC_IRQ_FG BIT(2) +#define MAX77705_SRC_IRQ_USBC BIT(3) +#define MAX77705_SRC_IRQ_ALL (MAX77705_SRC_IRQ_CHG | MAX77705_SRC_IRQ_TOP | \ + MAX77705_SRC_IRQ_FG | MAX77705_SRC_IRQ_USBC) + +/* MAX77705_PMIC_REG_PMICREV register */ +#define MAX77705_VERSION_SHIFT 3 +#define MAX77705_REVISION_MASK GENMASK(2, 0) +#define MAX77705_VERSION_MASK GENMASK(7, MAX77705_VERSION_SHIFT) +/* MAX77705_PMIC_REG_MAINCTRL1 register */ +#define MAX77705_MAINCTRL1_BIASEN_SHIFT 7 +#define MAX77705_MAINCTRL1_BIASEN_MASK BIT(MAX77705_MAINCTRL1_BIASEN_SHIFT) +/* MAX77705_PMIC_REG_MCONFIG2 (haptics) register */ +#define MAX77705_CONFIG2_MEN_SHIFT 6 +#define MAX77705_CONFIG2_MODE_SHIFT 7 +#define MAX77705_CONFIG2_HTYP_SHIFT 5 +/* MAX77705_PMIC_REG_SYSTEM_INT_MASK register */ +#define MAX77705_SYSTEM_IRQ_BSTEN_INT BIT(3) +#define MAX77705_SYSTEM_IRQ_SYSUVLO_INT BIT(4) +#define MAX77705_SYSTEM_IRQ_SYSOVLO_INT BIT(5) +#define MAX77705_SYSTEM_IRQ_TSHDN_INT BIT(6) +#define MAX77705_SYSTEM_IRQ_TM_INT BIT(7) + +enum max77705_hw_rev { + MAX77705_PASS1 = 1, + MAX77705_PASS2, + MAX77705_PASS3 +}; + +enum max77705_reg { + MAX77705_PMIC_REG_PMICID1 = 0x00, + MAX77705_PMIC_REG_PMICREV = 0x01, + MAX77705_PMIC_REG_MAINCTRL1 = 0x02, + MAX77705_PMIC_REG_BSTOUT_MASK = 0x03, + MAX77705_PMIC_REG_FORCE_EN_MASK = 0x08, + MAX77705_PMIC_REG_MCONFIG = 0x10, + MAX77705_PMIC_REG_MCONFIG2 = 0x11, + MAX77705_PMIC_REG_INTSRC = 0x22, + MAX77705_PMIC_REG_INTSRC_MASK = 0x23, + MAX77705_PMIC_REG_SYSTEM_INT = 0x24, + MAX77705_PMIC_REG_RESERVED_25 = 0x25, + MAX77705_PMIC_REG_SYSTEM_INT_MASK = 0x26, + MAX77705_PMIC_REG_RESERVED_27 = 0x27, + MAX77705_PMIC_REG_RESERVED_28 = 0x28, + 
MAX77705_PMIC_REG_RESERVED_29 = 0x29, + MAX77705_PMIC_REG_BOOSTCONTROL1 = 0x4C, + MAX77705_PMIC_REG_BOOSTCONTROL2 = 0x4F, + MAX77705_PMIC_REG_SW_RESET = 0x50, + MAX77705_PMIC_REG_USBC_RESET = 0x51, + + MAX77705_PMIC_REG_END +}; + +enum max77705_chg_reg { + MAX77705_CHG_REG_BASE = 0xB0, + MAX77705_CHG_REG_INT = 0, + MAX77705_CHG_REG_INT_MASK, + MAX77705_CHG_REG_INT_OK, + MAX77705_CHG_REG_DETAILS_00, + MAX77705_CHG_REG_DETAILS_01, + MAX77705_CHG_REG_DETAILS_02, + MAX77705_CHG_REG_DTLS_03, + MAX77705_CHG_REG_CNFG_00, + MAX77705_CHG_REG_CNFG_01, + MAX77705_CHG_REG_CNFG_02, + MAX77705_CHG_REG_CNFG_03, + MAX77705_CHG_REG_CNFG_04, + MAX77705_CHG_REG_CNFG_05, + MAX77705_CHG_REG_CNFG_06, + MAX77705_CHG_REG_CNFG_07, + MAX77705_CHG_REG_CNFG_08, + MAX77705_CHG_REG_CNFG_09, + MAX77705_CHG_REG_CNFG_10, + MAX77705_CHG_REG_CNFG_11, + + MAX77705_CHG_REG_CNFG_12, + MAX77705_CHG_REG_CNFG_13, + MAX77705_CHG_REG_CNFG_14, + MAX77705_CHG_REG_SAFEOUT_CTRL +}; + +enum max77705_fuelgauge_reg { + STATUS_REG = 0x00, + VALRT_THRESHOLD_REG = 0x01, + TALRT_THRESHOLD_REG = 0x02, + SALRT_THRESHOLD_REG = 0x03, + REMCAP_REP_REG = 0x05, + SOCREP_REG = 0x06, + TEMPERATURE_REG = 0x08, + VCELL_REG = 0x09, + TIME_TO_EMPTY_REG = 0x11, + FULLSOCTHR_REG = 0x13, + CURRENT_REG = 0x0A, + AVG_CURRENT_REG = 0x0B, + SOCMIX_REG = 0x0D, + SOCAV_REG = 0x0E, + REMCAP_MIX_REG = 0x0F, + FULLCAP_REG = 0x10, + RFAST_REG = 0x15, + AVR_TEMPERATURE_REG = 0x16, + CYCLES_REG = 0x17, + DESIGNCAP_REG = 0x18, + AVR_VCELL_REG = 0x19, + TIME_TO_FULL_REG = 0x20, + CONFIG_REG = 0x1D, + ICHGTERM_REG = 0x1E, + REMCAP_AV_REG = 0x1F, + FULLCAP_NOM_REG = 0x23, + LEARN_CFG_REG = 0x28, + FILTER_CFG_REG = 0x29, + MISCCFG_REG = 0x2B, + QRTABLE20_REG = 0x32, + FULLCAP_REP_REG = 0x35, + RCOMP_REG = 0x38, + VEMPTY_REG = 0x3A, + FSTAT_REG = 0x3D, + DISCHARGE_THRESHOLD_REG = 0x40, + QRTABLE30_REG = 0x42, + ISYS_REG = 0x43, + DQACC_REG = 0x45, + DPACC_REG = 0x46, + AVGISYS_REG = 0x4B, + QH_REG = 0x4D, + VSYS_REG = 0xB1, + TALRTTH2_REG = 0xB2, + VBYP_REG = 0xB3, + CONFIG2_REG = 0xBB, + IIN_REG = 0xD0, + OCV_REG = 0xEE, + VFOCV_REG = 0xFB, + VFSOC_REG = 0xFF, + + MAX77705_FG_END +}; + +enum max77705_led_reg { + MAX77705_RGBLED_REG_BASE = 0x30, + MAX77705_RGBLED_REG_LEDEN = 0, + MAX77705_RGBLED_REG_LED0BRT, + MAX77705_RGBLED_REG_LED1BRT, + MAX77705_RGBLED_REG_LED2BRT, + MAX77705_RGBLED_REG_LED3BRT, + MAX77705_RGBLED_REG_LEDRMP, + MAX77705_RGBLED_REG_LEDBLNK, + MAX77705_LED_REG_END +}; + +enum max77705_charger_battery_state { + MAX77705_BATTERY_NOBAT, + MAX77705_BATTERY_PREQUALIFICATION, + MAX77705_BATTERY_DEAD, + MAX77705_BATTERY_GOOD, + MAX77705_BATTERY_LOWVOLTAGE, + MAX77705_BATTERY_OVERVOLTAGE, + MAX77705_BATTERY_RESERVED +}; + +enum max77705_charger_charge_type { + MAX77705_CHARGER_CONSTANT_CURRENT = 1, + MAX77705_CHARGER_CONSTANT_VOLTAGE, + MAX77705_CHARGER_END_OF_CHARGE, + MAX77705_CHARGER_DONE +}; + +#endif /* __LINUX_MFD_MAX77705_PRIV_H */ -- cgit v1.2.3 From aebb5fc9a0d87916133b911e1ef2cc04a7996335 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 23 Jan 2025 18:04:32 +0300 Subject: leds: max77705: Add LEDs support This adds basic support for LEDs for the max77705 PMIC. 
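As a hedged illustration of how a LED sub-driver might program one RGB channel through the register enum above (the function is hypothetical; offsets are relative to MAX77705_RGBLED_REG_BASE, and the LED regmap handle is assumed to come from the shared MFD state):

static int max77705_led_set_brightness(struct regmap *regmap, u8 brightness)
{
	return regmap_write(regmap,
			    MAX77705_RGBLED_REG_BASE + MAX77705_RGBLED_REG_LED0BRT,
			    brightness);
}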
Signed-off-by: Dzmitry Sankouski
Acked-by: Krzysztof Kozlowski
Link: https://lore.kernel.org/r/20250123-starqltechn_integration_upstream-v17-7-8b06685b6612@gmail.com
Signed-off-by: Lee Jones
---
 include/linux/mfd/max77705-private.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/max77705-private.h b/include/linux/mfd/max77705-private.h
index 0c8b07abfb9d..214de7feeb8c 100644
--- a/include/linux/mfd/max77705-private.h
+++ b/include/linux/mfd/max77705-private.h
@@ -33,6 +33,23 @@
 #define MAX77705_SYSTEM_IRQ_SYSOVLO_INT	BIT(5)
 #define MAX77705_SYSTEM_IRQ_TSHDN_INT	BIT(6)
 #define MAX77705_SYSTEM_IRQ_TM_INT	BIT(7)
+/* MAX77705_RGBLED_REG_LEDEN register */
+#define MAX77705_RGBLED_EN_WIDTH	2
+/* MAX77705_RGBLED_REG_LEDBLNK register */
+#define MAX77705_RGB_DELAY_100_STEP_LIM		500
+#define MAX77705_RGB_DELAY_100_STEP_COUNT	4
+#define MAX77705_RGB_DELAY_100_STEP		100
+#define MAX77705_RGB_DELAY_250_STEP_LIM		3250
+#define MAX77705_RGB_DELAY_250_STEP		250
+#define MAX77705_RGB_DELAY_500_STEP		500
+#define MAX77705_RGB_DELAY_500_STEP_COUNT	10
+#define MAX77705_RGB_DELAY_500_STEP_LIM		5000
+#define MAX77705_RGB_DELAY_1000_STEP_LIM	8000
+#define MAX77705_RGB_DELAY_1000_STEP_COUNT	13
+#define MAX77705_RGB_DELAY_1000_STEP		1000
+#define MAX77705_RGB_DELAY_2000_STEP		2000
+#define MAX77705_RGB_DELAY_2000_STEP_COUNT	13
+#define MAX77705_RGB_DELAY_2000_STEP_LIM	12000

 enum max77705_hw_rev {
 	MAX77705_PASS1 = 1,
--
cgit v1.2.3

From 6866c91f8c2327546d850d5a85025fb81e2089bf Mon Sep 17 00:00:00 2001
From: Billy Tsai
Date: Tue, 4 Feb 2025 17:17:01 +0800
Subject: i3c: Remove the const qualifier from i2c_msg pointer in i2c_xfers API

The change is necessary to enable the use of the `i2c_get_dma_safe_msg_buf()` API, which requires a non-const `struct i2c_msg *` to operate. The `i2c_get_dma_safe_msg_buf()` function ensures safe handling of I2C messages when using DMA, making it essential for scenarios where DMA transfers are involved. By removing the `const` qualifier, this patch allows drivers to prepare and manage DMA-safe buffers directly.

Signed-off-by: Billy Tsai
Reviewed-by: Frank Li
Reviewed-by: Wolfram Sang
Acked-by: Mukesh Kumar Savaliya
Link: https://lore.kernel.org/r/20250204091702.4014466-1-billy_tsai@aspeedtech.com
Signed-off-by: Alexandre Belloni
---
 include/linux/i3c/master.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 12d532b012c5..c67922ece617 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -475,7 +475,7 @@ struct i3c_master_controller_ops {
 	int (*attach_i2c_dev)(struct i2c_dev_desc *dev);
 	void (*detach_i2c_dev)(struct i2c_dev_desc *dev);
 	int (*i2c_xfers)(struct i2c_dev_desc *dev,
-			 const struct i2c_msg *xfers, int nxfers);
+			 struct i2c_msg *xfers, int nxfers);
 	int (*request_ibi)(struct i3c_dev_desc *dev,
 			   const struct i3c_ibi_setup *req);
 	void (*free_ibi)(struct i3c_dev_desc *dev);
--
cgit v1.2.3

From fd93eaffb3f977b23bc0a48d4c8616e654fcf133 Mon Sep 17 00:00:00 2001
From: Jason Xing
Date: Thu, 20 Feb 2025 15:29:31 +0800
Subject: bpf: Prevent unsafe access to the sock fields in the BPF timestamping callback

The subsequent patch will implement BPF TX timestamping. It will call the sockops BPF program without holding the sock lock.

This breaks the current assumption that all sock ops programs will hold the sock lock. The sock fields of the uapi's bpf_sock_ops require this assumption.
To address this, a new "u8 is_locked_tcp_sock;" field is added. This patch sets it in the current sock_ops callbacks. The "is_fullsock" test is then replaced by the "is_locked_tcp_sock" test during sock_ops_convert_ctx_access().

The new TX timestamping callbacks added in the subsequent patch will not have this set. This will prevent unsafe access from the new timestamping callbacks.

Potentially, we could allow read-only access. However, this would require identifying which callback is read-safe-only and also requires additional BPF instruction rewrites in the convert_ctx. Since the BPF program can always read everything from a socket (e.g., by using bpf_core_cast), this patch keeps it simple and disables all read and write access to any socket fields through the bpf_sock_ops UAPI from the new TX timestamping callback.

Moreover, note that some of the fields in bpf_sock_ops are specific to tcp_sock, and sock_ops currently only supports tcp_sock. In the future, UDP timestamping will be added, which will also break this assumption. The same idea used in this patch will be reused. Considering that the current sock_ops only supports tcp_sock, the variable is named is_locked_"tcp"_sock.

Signed-off-by: Jason Xing
Signed-off-by: Martin KaFai Lau
Link: https://patch.msgid.link/20250220072940.99994-4-kerneljasonxing@gmail.com
---
 include/linux/filter.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index a3ea46281595..d36d5d5180b1 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1508,6 +1508,7 @@ struct bpf_sock_ops_kern {
 	void	*skb_data_end;
 	u8	op;
 	u8	is_fullsock;
+	u8	is_locked_tcp_sock;
 	u8	remaining_opt_len;
 	u64	temp;			/* temp and everything after is not
 					 * initialized to 0 before calling
--
cgit v1.2.3

From 6b98ec7e882af1c3088a88757e2226d06c8514f9 Mon Sep 17 00:00:00 2001
From: Jason Xing
Date: Thu, 20 Feb 2025 15:29:34 +0800
Subject: bpf: Add BPF_SOCK_OPS_TSTAMP_SCHED_CB callback

Support SCM_TSTAMP_SCHED case for bpf timestamping.

Add a new sock_ops callback, BPF_SOCK_OPS_TSTAMP_SCHED_CB. This callback will occur at the same timestamping point as the user space's SCM_TSTAMP_SCHED. The BPF program can use it to get the same SCM_TSTAMP_SCHED timestamp without modifying the user-space application.

A new SKBTX_BPF flag is added to mark skb_shinfo(skb)->tx_flags, ensuring that the new BPF timestamping and the current user space's SO_TIMESTAMPING do not interfere with each other.
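A hedged BPF-side sketch of consuming the new callback (assumes a libbpf build with the updated UAPI headers; the bpf_printk() usage is purely illustrative):

SEC("sockops")
int tstamp_sched(struct bpf_sock_ops *skops)
{
	if (skops->op == BPF_SOCK_OPS_TSTAMP_SCHED_CB)
		bpf_printk("skb entered packet scheduling at %llu ns",
			   bpf_ktime_get_ns());
	return 1;
}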
Signed-off-by: Jason Xing
Signed-off-by: Martin KaFai Lau
Reviewed-by: Willem de Bruijn
Link: https://patch.msgid.link/20250220072940.99994-7-kerneljasonxing@gmail.com
---
 include/linux/skbuff.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bb2b751d274a..52f6e033e704 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -489,10 +489,14 @@ enum {
 	/* generate software time stamp when entering packet scheduling */
 	SKBTX_SCHED_TSTAMP = 1 << 6,
+
+	/* used for bpf extension when a bpf program is loaded */
+	SKBTX_BPF = 1 << 7,
 };

 #define SKBTX_ANY_SW_TSTAMP	(SKBTX_SW_TSTAMP    | \
-				 SKBTX_SCHED_TSTAMP)
+				 SKBTX_SCHED_TSTAMP | \
+				 SKBTX_BPF)

 #define SKBTX_ANY_TSTAMP	(SKBTX_HW_TSTAMP | \
 				 SKBTX_HW_TSTAMP_USE_CYCLES | \
 				 SKBTX_ANY_SW_TSTAMP)
--
cgit v1.2.3

From ecebb17ad818bc043e558c278a6c56d5bbaebacc Mon Sep 17 00:00:00 2001
From: Jason Xing
Date: Thu, 20 Feb 2025 15:29:35 +0800
Subject: bpf: Add BPF_SOCK_OPS_TSTAMP_SND_SW_CB callback

Support sw SCM_TSTAMP_SND case for bpf timestamping.

Add a new sock_ops callback, BPF_SOCK_OPS_TSTAMP_SND_SW_CB. This callback will occur at the same timestamping point as the user space's software SCM_TSTAMP_SND. The BPF program can use it to get the same SCM_TSTAMP_SND timestamp without modifying the user-space application.

With this patch, the BPF program gets the software timestamp when the driver is ready to send the skb. In the subsequent patch, the hardware timestamp will be supported.

Signed-off-by: Jason Xing
Signed-off-by: Martin KaFai Lau
Reviewed-by: Willem de Bruijn
Link: https://patch.msgid.link/20250220072940.99994-8-kerneljasonxing@gmail.com
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 52f6e033e704..76582500c5ea 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4568,7 +4568,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
 static inline void skb_tx_timestamp(struct sk_buff *skb)
 {
 	skb_clone_tx_timestamp(skb);
-	if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP)
+	if (skb_shinfo(skb)->tx_flags & (SKBTX_SW_TSTAMP | SKBTX_BPF))
 		skb_tstamp_tx(skb, NULL);
 }
--
cgit v1.2.3

From 2deaf7f42b8c551e84da20483ca2d4a65c3623b3 Mon Sep 17 00:00:00 2001
From: Jason Xing
Date: Thu, 20 Feb 2025 15:29:36 +0800
Subject: bpf: Add BPF_SOCK_OPS_TSTAMP_SND_HW_CB callback

Support hw SCM_TSTAMP_SND case for bpf timestamping.

Add a new sock_ops callback, BPF_SOCK_OPS_TSTAMP_SND_HW_CB. This callback will occur at the same timestamping point as the user space's hardware SCM_TSTAMP_SND. The BPF program can use it to get the same SCM_TSTAMP_SND timestamp without modifying the user-space application.

To avoid increasing the code complexity, replace SKBTX_HW_TSTAMP with SKBTX_HW_TSTAMP_NOBPF instead of changing the numerous driver-side callers that use SKBTX_HW_TSTAMP. The new definition of SKBTX_HW_TSTAMP covers both the socket timestamping and the BPF timestamping tests, so drivers keep working unchanged under BPF timestamping after this patch.

Since some drivers don't assign the hardware timestamp to the skb, this patch does the assignment, so that the BPF program can acquire the hwstamp from the skb directly.
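For context, a sketch of the canonical driver-side pattern that keeps working unchanged with the combined flag (not taken from any specific driver; hw_ns stands in for the NIC's completion timestamp):

	/* TX path: request a hardware timestamp if anyone asked for one */
	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) {
		skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
		/* ...arm timestamping in the TX descriptor... */
	}

	/* TX completion path: report the timestamp back */
	struct skb_shared_hwtstamps hwts = { .hwtstamp = ns_to_ktime(hw_ns) };
	skb_tstamp_tx(skb, &hwts);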
Signed-off-by: Jason Xing
Signed-off-by: Martin KaFai Lau
Reviewed-by: Willem de Bruijn
Link: https://patch.msgid.link/20250220072940.99994-9-kerneljasonxing@gmail.com
---
 include/linux/skbuff.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 76582500c5ea..0b4f1889500d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -470,7 +470,7 @@ struct skb_shared_hwtstamps {
 /* Definitions for tx_flags in struct skb_shared_info */
 enum {
 	/* generate hardware time stamp */
-	SKBTX_HW_TSTAMP = 1 << 0,
+	SKBTX_HW_TSTAMP_NOBPF = 1 << 0,

 	/* generate software time stamp when queueing packet to NIC */
 	SKBTX_SW_TSTAMP = 1 << 1,
@@ -494,6 +494,8 @@ enum {
 	SKBTX_BPF = 1 << 7,
 };

+#define SKBTX_HW_TSTAMP		(SKBTX_HW_TSTAMP_NOBPF | SKBTX_BPF)
+
 #define SKBTX_ANY_SW_TSTAMP	(SKBTX_SW_TSTAMP    | \
 				 SKBTX_SCHED_TSTAMP | \
 				 SKBTX_BPF)
 #define SKBTX_ANY_TSTAMP	(SKBTX_HW_TSTAMP | \
 				 SKBTX_HW_TSTAMP_USE_CYCLES | \
 				 SKBTX_ANY_SW_TSTAMP)
--
cgit v1.2.3

From bb3bb6c92e5719c0f5d7adb9d34db7e76705ac33 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit
Date: Wed, 19 Feb 2025 21:15:05 +0100
Subject: net: phy: remove unused feature array declarations

After 12d5151be010 ("net: phy: remove leftovers from switch to linkmode bitmaps") the following declarations are unused and can be removed too.

Signed-off-by: Heiner Kallweit
Reviewed-by: Michal Swiatkowski
Reviewed-by: Mateusz Polchlopek
Reviewed-by: Russell King (Oracle)
Link: https://patch.msgid.link/b2883c75-4108-48f2-ab73-e81647262bc2@gmail.com
Signed-off-by: Jakub Kicinski
---
 include/linux/phy.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 584710e084eb..13be48d3b8b3 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -37,10 +37,7 @@ extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1_features) __ro_after_init;
 extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1s_p2mp_features) __ro_after_init;
 extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_features) __ro_after_init;
 extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_fibre_features) __ro_after_init;
-extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_all_ports_features) __ro_after_init;
 extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_features) __ro_after_init;
-extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_fec_features) __ro_after_init;
-extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_full_features) __ro_after_init;
 extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap1_features) __ro_after_init;
 extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap2_features) __ro_after_init;
--
cgit v1.2.3

From 97673ea38a77e42eaafcf5181c84f6c8d40b97e7 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Thu, 13 Feb 2025 21:48:48 +0200
Subject: gpio: regmap: Group optional assignments together for better understanding

Group the ngpio_per_reg, reg_stride, and reg_mask_xlate assignments together with the respective conditional, for a better understanding of what's going on in the code.

While at it, mark ngpio_per_reg as (Optional) in the kernel-doc, in accordance with what the code actually does.
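To make the optionality concrete, a minimal hypothetical gpio-regmap user could look like this (register addresses are invented for illustration; ngpio_per_reg, reg_stride and reg_mask_xlate are simply left unset so the defaults apply):

	struct gpio_regmap_config cfg = {
		.parent		= dev,
		.regmap		= regmap,
		.ngpio		= 16,
		.reg_dat_base	= 0x10,	/* input data register */
		.reg_set_base	= 0x14,	/* output value register */
	};

	return PTR_ERR_OR_ZERO(gpio_regmap_register(&cfg));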
Signed-off-by: Andy Shevchenko
Reviewed-by: Michael Walle
Reviewed-by: Linus Walleij
Tested-by: Mathieu Dubois-Briand
Reviewed-by: Mathieu Dubois-Briand
Link: https://lore.kernel.org/r/20250213195621.3133406-4-andriy.shevchenko@linux.intel.com
Signed-off-by: Bartosz Golaszewski
---
 include/linux/gpio/regmap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h
index a9f7b7faf57b..b9240e4156cc 100644
--- a/include/linux/gpio/regmap.h
+++ b/include/linux/gpio/regmap.h
@@ -30,7 +30,7 @@ struct regmap;
 * @reg_dir_out_base:	(Optional) out setting register base address
 * @reg_stride:		(Optional) May be set if the registers (of the
 *			same type, dat, set, etc) are not consecutive.
- * @ngpio_per_reg:	Number of GPIOs per register
+ * @ngpio_per_reg:	(Optional) Number of GPIOs per register
 * @irq_domain:		(Optional) IRQ domain if the controller is
 *			interrupt-capable
 * @reg_mask_xlate:	(Optional) Translates base address and GPIO
--
cgit v1.2.3

From db305161880a024a43f4b1cbafa7a294793d7a9e Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Thu, 13 Feb 2025 21:48:50 +0200
Subject: gpio: regmap: Allow ngpio to be read from the property

GPIOLIB supports the case where the number of supported GPIOs can be read from a device property. Enable this for drivers that use the GPIO regmap layer.

Signed-off-by: Andy Shevchenko
Reviewed-by: Michael Walle
Reviewed-by: Linus Walleij
Tested-by: Mathieu Dubois-Briand
Reviewed-by: Mathieu Dubois-Briand
Link: https://lore.kernel.org/r/20250213195621.3133406-6-andriy.shevchenko@linux.intel.com
Signed-off-by: Bartosz Golaszewski
---
 include/linux/gpio/regmap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h
index b9240e4156cc..c722c67668c6 100644
--- a/include/linux/gpio/regmap.h
+++ b/include/linux/gpio/regmap.h
@@ -21,7 +21,7 @@ struct regmap;
 *			If not given, the fwnode of the parent is used.
 * @label:		(Optional) Descriptive name for GPIO controller.
 *			If not given, the name of the device is used.
- * @ngpio:		Number of GPIOs
+ * @ngpio:		(Optional) Number of GPIOs
 * @names:		(Optional) Array of names for gpios
 * @reg_dat_base:	(Optional) (in) register base address
 * @reg_set_base:	(Optional) set register base address
--
cgit v1.2.3

From 5b47aba858101639a4442c2140b73f64eb796ed1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?=
Date: Tue, 4 Feb 2025 13:05:35 +0100
Subject: vdso: Introduce vdso/align.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The vDSO implementation can only include headers from the vdso/ namespace. To enable the usage of the ALIGN() macro from the vDSO, move the contents of linux/align.h wholly to vdso/align.h. Since its only dependency, linux/const.h, is itself just a wrapper around vdso/const.h, adapt that dependency accordingly. Also provide a compatibility wrapper linux/align.h.
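The moved macros behave exactly as before; for instance, these compile-time checks hold (assuming a kernel translation unit where static_assert is available):

#include <vdso/align.h>

static_assert(ALIGN(5, 4) == 8);
static_assert(ALIGN_DOWN(5, 4) == 4);
static_assert(IS_ALIGNED(4096, 64));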
Signed-off-by: Thomas Weißschuh
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/all/20250204-vdso-store-rng-v3-3-13a4669dfc8c@linutronix.de
---
 include/linux/align.h | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/align.h b/include/linux/align.h
index 2b4acec7b95a..55debf105a5d 100644
--- a/include/linux/align.h
+++ b/include/linux/align.h
@@ -2,14 +2,6 @@
 #ifndef _LINUX_ALIGN_H
 #define _LINUX_ALIGN_H

-#include <linux/const.h>
-
-/* @a is a power of 2 value */
-#define ALIGN(x, a)		__ALIGN_KERNEL((x), (a))
-#define ALIGN_DOWN(x, a)	__ALIGN_KERNEL((x) - ((a) - 1), (a))
-#define __ALIGN_MASK(x, mask)	__ALIGN_KERNEL_MASK((x), (mask))
-#define PTR_ALIGN(p, a)		((typeof(p))ALIGN((unsigned long)(p), (a)))
-#define PTR_ALIGN_DOWN(p, a)	((typeof(p))ALIGN_DOWN((unsigned long)(p), (a)))
-#define IS_ALIGNED(x, a)	(((x) & ((typeof(x))(a) - 1)) == 0)
+#include <vdso/align.h>

 #endif	/* _LINUX_ALIGN_H */
--
cgit v1.2.3

From df7fcbefa71090a276fb841ffe19b8e5f12b4767 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?=
Date: Tue, 4 Feb 2025 13:05:37 +0100
Subject: vdso: Add generic time data storage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Historically each architecture defined its own way to store the vDSO data page. Add a generic mechanism to provide storage for that page.

Furthermore this generic storage will be extended to also provide uniform storage for *non*-time-related data, like the random state or architecture-specific data. These will have their own pages and data structures, so rename 'vdso_data' into 'vdso_time_data' to make that split clear from the name.

Also introduce a new consistent naming scheme for the symbols related to the vDSO, which makes it clear if the symbol is accessible from userspace or kernel space and the type of data behind the symbol.

The generic fault handler contains an optimization to prefault the vvar page when the timens page is accessed. This was lifted from s390 and x86.
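A hedged sketch of how an architecture's vDSO setup code is expected to consume the new helper (the address selection and surrounding error handling are simplified assumptions):

	/* in arch_setup_additional_pages() or its equivalent: */
	struct vm_area_struct *vma;

	vma = vdso_install_vvar_mapping(mm, vvar_start);
	if (IS_ERR(vma))
		return PTR_ERR(vma);
	/* ...then map the vDSO code pages right after the vvar area... */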
Co-developed-by: Nam Cao
Signed-off-by: Nam Cao
Signed-off-by: Thomas Weißschuh
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/all/20250204-vdso-store-rng-v3-5-13a4669dfc8c@linutronix.de
---
 include/linux/time_namespace.h |  1 +
 include/linux/vdso_datastore.h | 10 ++++++++++
 2 files changed, 11 insertions(+)
 create mode 100644 include/linux/vdso_datastore.h

(limited to 'include/linux')

diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index 876e31b4461d..4b81db223f54 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include

 struct user_namespace;
 extern struct user_namespace init_user_ns;

diff --git a/include/linux/vdso_datastore.h b/include/linux/vdso_datastore.h
new file mode 100644
index 000000000000..a91fa24b06e0
--- /dev/null
+++ b/include/linux/vdso_datastore.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VDSO_DATASTORE_H
+#define _LINUX_VDSO_DATASTORE_H
+
+#include <linux/mm_types.h>
+
+extern const struct vm_special_mapping vdso_vvar_mapping;
+struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned long addr);
+
+#endif /* _LINUX_VDSO_DATASTORE_H */
--
cgit v1.2.3

From ac1a42f4e4e296b5ba5fdb39444f65d6e5196240 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?=
Date: Tue, 4 Feb 2025 13:05:50 +0100
Subject: vdso: Remove remnants of architecture-specific time storage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All users of the time related parts of the vDSO are now using the generic storage implementation. Remove the now-unnecessary compatibility accessor functions and symbols.

Co-developed-by: Nam Cao
Signed-off-by: Nam Cao
Signed-off-by: Thomas Weißschuh
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/all/20250204-vdso-store-rng-v3-18-13a4669dfc8c@linutronix.de
---
 include/linux/time_namespace.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index 4b81db223f54..0b8b32bf0655 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -8,7 +8,6 @@
 #include
 #include
 #include
-#include

 struct user_namespace;
 extern struct user_namespace init_user_ns;
@@ -166,6 +165,4 @@ static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim)
 }
 #endif

-struct vdso_data *arch_get_vdso_data(void *vvar_page);
-
 #endif /* _LINUX_TIMENS_H */
--
cgit v1.2.3

From 8b17e540969a0983abe96ffc352397890ab67bf1 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Sun, 9 Feb 2025 19:55:20 +0100
Subject: vfs: add initial support for CONFIG_DEBUG_VFS

Small collection of macros taken from mmdebug.h

Signed-off-by: Mateusz Guzik
Link: https://lore.kernel.org/r/20250209185523.745956-2-mjguzik@gmail.com
Reviewed-by: Jan Kara
Signed-off-by: Christian Brauner
---
 include/linux/fs.h       |  1 +
 include/linux/vfsdebug.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+)
 create mode 100644 include/linux/vfsdebug.h

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index be3ad155ec9f..d5e3fb14ad8c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_FS_H
 #define _LINUX_FS_H

+#include <linux/vfsdebug.h>
 #include
 #include
 #include
diff --git a/include/linux/vfsdebug.h b/include/linux/vfsdebug.h
new file mode 100644
index 000000000000..9cf22d3eb9dd
--- /dev/null
+++ b/include/linux/vfsdebug.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_VFS_DEBUG_H
+#define LINUX_VFS_DEBUG_H 1
+
+#include <linux/bug.h>
+
+struct inode;
+
+#ifdef CONFIG_DEBUG_VFS
+void dump_inode(struct inode *inode, const char *reason);
+
+#define VFS_BUG_ON(cond) BUG_ON(cond)
+#define VFS_WARN_ON(cond) (void)WARN_ON(cond)
+#define VFS_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
+#define VFS_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format)
+#define VFS_WARN(cond, format...) (void)WARN(cond, format)
+
+#define VFS_BUG_ON_INODE(cond, inode)		({			\
+	if (unlikely(!!(cond))) {					\
+		dump_inode(inode, "VFS_BUG_ON_INODE(" #cond")");	\
+		BUG_ON(1);						\
+	}								\
+})
+
+#define VFS_WARN_ON_INODE(cond, inode)		({			\
+	int __ret_warn = !!(cond);					\
+									\
+	if (unlikely(__ret_warn)) {					\
+		dump_inode(inode, "VFS_WARN_ON_INODE(" #cond")");	\
+		WARN_ON(1);						\
+	}								\
+	unlikely(__ret_warn);						\
+})
+#else
+#define VFS_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
+#define VFS_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
+#define VFS_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
+#define VFS_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
+#define VFS_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
+
+#define VFS_BUG_ON_INODE(cond, inode) VFS_BUG_ON(cond)
+#define VFS_WARN_ON_INODE(cond, inode) BUILD_BUG_ON_INVALID(cond)
+#endif /* CONFIG_DEBUG_VFS */
+
+#endif
--
cgit v1.2.3

From 3eb7e95104141609d4aed15affadedb81c408f03 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Sun, 9 Feb 2025 19:55:22 +0100
Subject: vfs: use the new debug macros in inode_set_cached_link()

Signed-off-by: Mateusz Guzik
Link: https://lore.kernel.org/r/20250209185523.745956-4-mjguzik@gmail.com
Reviewed-by: Jan Kara
Signed-off-by: Christian Brauner
---
 include/linux/fs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index d5e3fb14ad8c..e71d58c7f59c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -792,6 +792,8 @@ struct inode {
 static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen)
 {
+	VFS_WARN_ON_INODE(strlen(link) != linklen, inode);
+	VFS_WARN_ON_INODE(inode->i_opflags & IOP_CACHED_LINK, inode);
 	inode->i_link = link;
 	inode->i_linklen = linklen;
 	inode->i_opflags |= IOP_CACHED_LINK;
--
cgit v1.2.3

From d6ff4c8f65220b20c550f12eb8921827c459600e Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Sun, 19 Jan 2025 11:32:05 +0100
Subject: fs: avoid mmap sem relocks when coredumping with many missing pages

Dumping processes with large allocated and mostly not-faulted areas is very slow.

Borrowing a test case from Tavian Barnes:

int main(void)
{
	char *mem = mmap(NULL, 1ULL << 40, PROT_READ | PROT_WRITE,
			 MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
	printf("%p %m\n", mem);
	if (mem != MAP_FAILED) {
		mem[0] = 1;
	}
	abort();
}

That's 1TB of almost completely not-populated area.

On my test box it takes 13-14 seconds to dump.
The profile shows:

-   99.89%     0.00%  a.out
       entry_SYSCALL_64_after_hwframe
       do_syscall_64
       syscall_exit_to_user_mode
       arch_do_signal_or_restart
     - get_signal
        - 99.89% do_coredump
           - 99.88% elf_core_dump
              - dump_user_range
                 - 98.12% get_dump_page
                    - 64.19% __get_user_pages
                       - 40.92% gup_vma_lookup
                          - find_vma
                             - mt_find
                                  4.21% __rcu_read_lock
                                  1.33% __rcu_read_unlock
                       - 3.14% check_vma_flags
                            0.68% vma_is_secretmem
                         0.61% __cond_resched
                         0.60% vma_pgtable_walk_end
                         0.59% vma_pgtable_walk_begin
                         0.58% no_page_table
                 - 15.13% down_read_killable
                      0.69% __cond_resched
                   13.84% up_read
                   0.58% __cond_resched

Almost 29% of the time is spent relocking the mmap semaphore between calls to get_dump_page() which find nothing.

Whacking that results in times of 10 seconds (down from 13-14).

While here make the thing killable.

The real problem is the page-sized iteration and the real fix would patch it up instead. It is left as an exercise for the mm-familiar reader.

Signed-off-by: Mateusz Guzik
Link: https://lore.kernel.org/r/20250119103205.2172432-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7b1068ddcbb7..78f9e12cc861 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2549,7 +2549,7 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
 			struct task_struct *task, bool bypass_rlim);

 struct kvec;
-struct page *get_dump_page(unsigned long addr);
+struct page *get_dump_page(unsigned long addr, int *locked);

 bool folio_mark_dirty(struct folio *folio);
 bool folio_mark_dirty_lock(struct folio *folio);
--
cgit v1.2.3

From 29d80d506b18384f64a54b01fae78184e2d327f3 Mon Sep 17 00:00:00 2001
From: Yuichiro Tsuji
Date: Tue, 21 Jan 2025 16:08:22 +0900
Subject: open: Fix return type of several functions from long to int

Fix the return type of several functions from long to int to match their actual behavior. These functions only return int values. This change improves type consistency across the filesystem code and aligns the function signatures with their existing implementation and usage.
Reviewed-by: Jan Kara
Signed-off-by: Yuichiro Tsuji
Link: https://lore.kernel.org/r/20250121070844.4413-2-yuichtsu@amazon.com
Signed-off-by: Christian Brauner
---
 include/linux/fs.h       | 6 +++---
 include/linux/syscalls.h | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index e71d58c7f59c..d1f1673956e2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2785,13 +2785,13 @@ static inline bool is_idmapped_mnt(const struct vfsmount *mnt)
 	return mnt_idmap(mnt) != &nop_mnt_idmap;
 }

-extern long vfs_truncate(const struct path *, loff_t);
+int vfs_truncate(const struct path *, loff_t);
 int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start,
 		unsigned int time_attrs, struct file *filp);
 extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
 			loff_t len);
-extern long do_sys_open(int dfd, const char __user *filename, int flags,
-			umode_t mode);
+int do_sys_open(int dfd, const char __user *filename, int flags,
+		umode_t mode);
 extern struct file *file_open_name(struct filename *, int, umode_t);
 extern struct file *filp_open(const char *, int, umode_t);
 extern struct file *file_open_root(const struct path *,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c6333204d451..bae4490c1dda 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1266,14 +1266,14 @@ static inline long ksys_lchown(const char __user *filename, uid_t user,
 			     AT_SYMLINK_NOFOLLOW);
 }

-extern long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
+int do_sys_ftruncate(unsigned int fd, loff_t length, int small);

 static inline long ksys_ftruncate(unsigned int fd, loff_t length)
 {
 	return do_sys_ftruncate(fd, length, 1);
 }

-extern long do_sys_truncate(const char __user *pathname, loff_t length);
+int do_sys_truncate(const char __user *pathname, loff_t length);

 static inline long ksys_truncate(const char __user *pathname, loff_t length)
 {
--
cgit v1.2.3

From f326565c44419380d18290edee5f78921418f7a5 Mon Sep 17 00:00:00 2001
From: Yuichiro Tsuji
Date: Tue, 21 Jan 2025 16:08:23 +0900
Subject: ioctl: Fix return type of several functions from long to int

Fix the return type of several functions from long to int to match their actual behavior. These functions only return int values. This change improves type consistency across the filesystem code and aligns the function signatures with their existing implementation and usage.
Reviewed-by: Jan Kara Signed-off-by: Yuichiro Tsuji Link: https://lore.kernel.org/r/20250121070844.4413-3-yuichtsu@amazon.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index d1f1673956e2..c1763b022d06 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2030,7 +2030,7 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); -extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, -- cgit v1.2.3 From 1bb772565f327291b0a463b125c7646dc45ae8b4 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 6 Feb 2025 01:01:05 +0100 Subject: vfs: inline getname() It is merely a trivial wrapper around getname_flags which adds a zeroed argument, no point paying for an extra call. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250206000105.432528-1-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c1763b022d06..3e07e4a44de6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2842,7 +2842,10 @@ extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname_flags(const char __user *, int); extern struct filename *getname_uflags(const char __user *, int); -extern struct filename *getname(const char __user *); +static inline struct filename *getname(const char __user *name) +{ + return getname_flags(name, 0); +} extern struct filename *getname_kernel(const char *); extern struct filename *__getname_maybe_null(const char __user *); static inline struct filename *getname_maybe_null(const char __user *name, int flags) -- cgit v1.2.3 From 1479be62582dbd2078390ef609f8f5ef351c15e8 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 12 Feb 2025 19:04:59 +0100 Subject: vfs: inline new_inode_pseudo() and de-staticize alloc_inode() The former is a no-op wrapper with the same argument. I left it in place to not lose the information who needs it -- one day "pseudo" inodes may start differing from what alloc_inode() returns. In the meantime no point taking a detour. 
Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250212180459.1022983-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3e07e4a44de6..70c985f3d787 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3285,7 +3285,11 @@ static inline void __iget(struct inode *inode) extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void __destroy_inode(struct inode *); -extern struct inode *new_inode_pseudo(struct super_block *sb); +struct inode *alloc_inode(struct super_block *sb); +static inline struct inode *new_inode_pseudo(struct super_block *sb) +{ + return alloc_inode(sb); +} extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); -- cgit v1.2.3 From 448fa70158f9b348e71869cfe4a31988e07b20b2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 20 Feb 2025 17:39:41 +0100 Subject: sysv: Remove the filesystem Since 2002 (change "Replace BKL for chain locking with sysvfs-private rwlock") the sysv filesystem was doing IO under a rwlock in its get_block() function (yes, a non-sleepable lock hold over a function used to read inode metadata for all reads and writes). Nobody noticed until syzbot in 2023 [1]. This shows nobody is using the filesystem. Just drop it. [1] https://lore.kernel.org/all/0000000000000ccf9a05ee84f5b0@google.com/ Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20250220163940.10155-2-jack@suse.cz Reviewed-by: Jeff Layton Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/sysv_fs.h | 214 ------------------------------------------------ 1 file changed, 214 deletions(-) delete mode 100644 include/linux/sysv_fs.h (limited to 'include/linux') diff --git a/include/linux/sysv_fs.h b/include/linux/sysv_fs.h deleted file mode 100644 index 5cf77dbb8d86..000000000000 --- a/include/linux/sysv_fs.h +++ /dev/null @@ -1,214 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SYSV_FS_H -#define _LINUX_SYSV_FS_H - -#define __packed2__ __attribute__((packed, aligned(2))) - - -#ifndef __KERNEL__ -typedef u16 __fs16; -typedef u32 __fs16; -#endif - -/* inode numbers are 16 bit */ -typedef __fs16 sysv_ino_t; - -/* Block numbers are 24 bit, sometimes stored in 32 bit. - On Coherent FS, they are always stored in PDP-11 manner: the least - significant 16 bits come last. 
*/ -typedef __fs32 sysv_zone_t; - -/* 0 is non-existent */ -#define SYSV_BADBL_INO 1 /* inode of bad blocks file */ -#define SYSV_ROOT_INO 2 /* inode of root directory */ - - -/* Xenix super-block data on disk */ -#define XENIX_NICINOD 100 /* number of inode cache entries */ -#define XENIX_NICFREE 100 /* number of free block list chunk entries */ -struct xenix_super_block { - __fs16 s_isize; /* index of first data zone */ - __fs32 s_fsize __packed2__; /* total number of zones of this fs */ - /* the start of the free block list: */ - __fs16 s_nfree; /* number of free blocks in s_free, <= XENIX_NICFREE */ - sysv_zone_t s_free[XENIX_NICFREE]; /* first free block list chunk */ - /* the cache of free inodes: */ - __fs16 s_ninode; /* number of free inodes in s_inode, <= XENIX_NICINOD */ - sysv_ino_t s_inode[XENIX_NICINOD]; /* some free inodes */ - /* locks, not used by Linux: */ - char s_flock; /* lock during free block list manipulation */ - char s_ilock; /* lock during inode cache manipulation */ - char s_fmod; /* super-block modified flag */ - char s_ronly; /* flag whether fs is mounted read-only */ - __fs32 s_time __packed2__; /* time of last super block update */ - __fs32 s_tfree __packed2__; /* total number of free zones */ - __fs16 s_tinode; /* total number of free inodes */ - __fs16 s_dinfo[4]; /* device information ?? */ - char s_fname[6]; /* file system volume name */ - char s_fpack[6]; /* file system pack name */ - char s_clean; /* set to 0x46 when filesystem is properly unmounted */ - char s_fill[371]; - s32 s_magic; /* version of file system */ - __fs32 s_type; /* type of file system: 1 for 512 byte blocks - 2 for 1024 byte blocks - 3 for 2048 byte blocks */ - -}; - -/* - * SystemV FS comes in two variants: - * sysv2: System V Release 2 (e.g. Microport), structure elements aligned(2). - * sysv4: System V Release 4 (e.g. Consensys), structure elements aligned(4). - */ -#define SYSV_NICINOD 100 /* number of inode cache entries */ -#define SYSV_NICFREE 50 /* number of free block list chunk entries */ - -/* SystemV4 super-block data on disk */ -struct sysv4_super_block { - __fs16 s_isize; /* index of first data zone */ - u16 s_pad0; - __fs32 s_fsize; /* total number of zones of this fs */ - /* the start of the free block list: */ - __fs16 s_nfree; /* number of free blocks in s_free, <= SYSV_NICFREE */ - u16 s_pad1; - sysv_zone_t s_free[SYSV_NICFREE]; /* first free block list chunk */ - /* the cache of free inodes: */ - __fs16 s_ninode; /* number of free inodes in s_inode, <= SYSV_NICINOD */ - u16 s_pad2; - sysv_ino_t s_inode[SYSV_NICINOD]; /* some free inodes */ - /* locks, not used by Linux: */ - char s_flock; /* lock during free block list manipulation */ - char s_ilock; /* lock during inode cache manipulation */ - char s_fmod; /* super-block modified flag */ - char s_ronly; /* flag whether fs is mounted read-only */ - __fs32 s_time; /* time of last super block update */ - __fs16 s_dinfo[4]; /* device information ?? 
*/ - __fs32 s_tfree; /* total number of free zones */ - __fs16 s_tinode; /* total number of free inodes */ - u16 s_pad3; - char s_fname[6]; /* file system volume name */ - char s_fpack[6]; /* file system pack name */ - s32 s_fill[12]; - __fs32 s_state; /* file system state: 0x7c269d38-s_time means clean */ - s32 s_magic; /* version of file system */ - __fs32 s_type; /* type of file system: 1 for 512 byte blocks - 2 for 1024 byte blocks */ -}; - -/* SystemV2 super-block data on disk */ -struct sysv2_super_block { - __fs16 s_isize; /* index of first data zone */ - __fs32 s_fsize __packed2__; /* total number of zones of this fs */ - /* the start of the free block list: */ - __fs16 s_nfree; /* number of free blocks in s_free, <= SYSV_NICFREE */ - sysv_zone_t s_free[SYSV_NICFREE]; /* first free block list chunk */ - /* the cache of free inodes: */ - __fs16 s_ninode; /* number of free inodes in s_inode, <= SYSV_NICINOD */ - sysv_ino_t s_inode[SYSV_NICINOD]; /* some free inodes */ - /* locks, not used by Linux: */ - char s_flock; /* lock during free block list manipulation */ - char s_ilock; /* lock during inode cache manipulation */ - char s_fmod; /* super-block modified flag */ - char s_ronly; /* flag whether fs is mounted read-only */ - __fs32 s_time __packed2__; /* time of last super block update */ - __fs16 s_dinfo[4]; /* device information ?? */ - __fs32 s_tfree __packed2__; /* total number of free zones */ - __fs16 s_tinode; /* total number of free inodes */ - char s_fname[6]; /* file system volume name */ - char s_fpack[6]; /* file system pack name */ - s32 s_fill[14]; - __fs32 s_state; /* file system state: 0xcb096f43 means clean */ - s32 s_magic; /* version of file system */ - __fs32 s_type; /* type of file system: 1 for 512 byte blocks - 2 for 1024 byte blocks */ -}; - -/* V7 super-block data on disk */ -#define V7_NICINOD 100 /* number of inode cache entries */ -#define V7_NICFREE 50 /* number of free block list chunk entries */ -struct v7_super_block { - __fs16 s_isize; /* index of first data zone */ - __fs32 s_fsize __packed2__; /* total number of zones of this fs */ - /* the start of the free block list: */ - __fs16 s_nfree; /* number of free blocks in s_free, <= V7_NICFREE */ - sysv_zone_t s_free[V7_NICFREE]; /* first free block list chunk */ - /* the cache of free inodes: */ - __fs16 s_ninode; /* number of free inodes in s_inode, <= V7_NICINOD */ - sysv_ino_t s_inode[V7_NICINOD]; /* some free inodes */ - /* locks, not used by Linux or V7: */ - char s_flock; /* lock during free block list manipulation */ - char s_ilock; /* lock during inode cache manipulation */ - char s_fmod; /* super-block modified flag */ - char s_ronly; /* flag whether fs is mounted read-only */ - __fs32 s_time __packed2__; /* time of last super block update */ - /* the following fields are not maintained by V7: */ - __fs32 s_tfree __packed2__; /* total number of free zones */ - __fs16 s_tinode; /* total number of free inodes */ - __fs16 s_m; /* interleave factor */ - __fs16 s_n; /* interleave factor */ - char s_fname[6]; /* file system name */ - char s_fpack[6]; /* file system pack name */ -}; -/* Constants to aid sanity checking */ -/* This is not a hard limit, nor enforced by v7 kernel. It's actually just - * the limit used by Seventh Edition's ls, though is high enough to assume - * that no reasonable file system would have that much entries in root - * directory. Thus, if we see anything higher, we just probably got the - * endiannes wrong. 
*/ -#define V7_NFILES 1024 -/* The disk addresses are three-byte (despite direct block addresses being - * aligned word-wise in inode). If the most significant byte is non-zero, - * something is most likely wrong (not a filesystem, bad bytesex). */ -#define V7_MAXSIZE 0x00ffffff - -/* Coherent super-block data on disk */ -#define COH_NICINOD 100 /* number of inode cache entries */ -#define COH_NICFREE 64 /* number of free block list chunk entries */ -struct coh_super_block { - __fs16 s_isize; /* index of first data zone */ - __fs32 s_fsize __packed2__; /* total number of zones of this fs */ - /* the start of the free block list: */ - __fs16 s_nfree; /* number of free blocks in s_free, <= COH_NICFREE */ - sysv_zone_t s_free[COH_NICFREE] __packed2__; /* first free block list chunk */ - /* the cache of free inodes: */ - __fs16 s_ninode; /* number of free inodes in s_inode, <= COH_NICINOD */ - sysv_ino_t s_inode[COH_NICINOD]; /* some free inodes */ - /* locks, not used by Linux: */ - char s_flock; /* lock during free block list manipulation */ - char s_ilock; /* lock during inode cache manipulation */ - char s_fmod; /* super-block modified flag */ - char s_ronly; /* flag whether fs is mounted read-only */ - __fs32 s_time __packed2__; /* time of last super block update */ - __fs32 s_tfree __packed2__; /* total number of free zones */ - __fs16 s_tinode; /* total number of free inodes */ - __fs16 s_interleave_m; /* interleave factor */ - __fs16 s_interleave_n; - char s_fname[6]; /* file system volume name */ - char s_fpack[6]; /* file system pack name */ - __fs32 s_unique; /* zero, not used */ -}; - -/* SystemV/Coherent inode data on disk */ -struct sysv_inode { - __fs16 i_mode; - __fs16 i_nlink; - __fs16 i_uid; - __fs16 i_gid; - __fs32 i_size; - u8 i_data[3*(10+1+1+1)]; - u8 i_gen; - __fs32 i_atime; /* time of last access */ - __fs32 i_mtime; /* time of last modification */ - __fs32 i_ctime; /* time of creation */ -}; - -/* SystemV/Coherent directory entry on disk */ -#define SYSV_NAMELEN 14 /* max size of name in struct sysv_dir_entry */ -struct sysv_dir_entry { - sysv_ino_t inode; - char name[SYSV_NAMELEN]; /* up to 14 characters, the rest are zeroes */ -}; - -#define SYSV_DIRSIZE sizeof(struct sysv_dir_entry) /* size of every directory entry */ - -#endif /* _LINUX_SYSV_FS_H */ -- cgit v1.2.3 From c749f058b4371430a8338e1ca72b9ae38fef613b Mon Sep 17 00:00:00 2001 From: Kannappan R Date: Thu, 20 Feb 2025 16:13:39 +0200 Subject: USB: core: Add eUSB2 descriptor and parsing in USB core Add support for the 'eUSB2 Isochronous Endpoint Companion Descriptor' introduced in the recent USB 2.0 specification 'USB 2.0 Double Isochronous IN Bandwidth' ECN. It allows embedded USB2 (eUSB2) devices to report and use higher bandwidths for isochronous IN transfers in order to support higher camera resolutions on the lid of laptops and tablets with minimal change to the USB2 protocol. The motivation for expanding USB 2.0 is further clarified in an additional Embedded USB2 version 2.0 (eUSB2v2) supplement to the USB 2.0 specification. It points out this is optimized for performance, power and cost by using the USB 2.0 low-voltage, power efficient PHY and half-duplex link for the asymmetric camera bandwidth needs, avoiding the costly and complex full-duplex USB 3.x symmetric link and gigabit receivers. 
eUSB2 devices that support the higher isochronous IN bandwidth and the new descriptor can be identified by their device descriptor bcdUSB value of 0x0220.

Co-developed-by: Amardeep Rai
Signed-off-by: Amardeep Rai
Signed-off-by: Kannappan R
Co-developed-by: Mathias Nyman
Signed-off-by: Mathias Nyman
Acked-by: Alan Stern
Link: https://lore.kernel.org/r/20250220141339.1939448-1-mathias.nyman@linux.intel.com
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/usb.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index cfa8005e24f9..b46738701f8d 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -51,6 +51,7 @@ struct ep_device;
 * @desc: descriptor for this endpoint, wMaxPacketSize in native byteorder
 * @ss_ep_comp: SuperSpeed companion descriptor for this endpoint
 * @ssp_isoc_ep_comp: SuperSpeedPlus isoc companion descriptor for this endpoint
+ * @eusb2_isoc_ep_comp: eUSB2 isoc companion descriptor for this endpoint
 * @urb_list: urbs queued to this endpoint; maintained by usbcore
 * @hcpriv: for use by HCD; typically holds hardware dma queue head (QH)
 *	with one or more transfer descriptors (TDs) per urb
@@ -64,9 +65,10 @@ struct ep_device;
 * descriptor within an active interface in a given USB configuration.
 */
 struct usb_host_endpoint {
-	struct usb_endpoint_descriptor		desc;
-	struct usb_ss_ep_comp_descriptor	ss_ep_comp;
-	struct usb_ssp_isoc_ep_comp_descriptor	ssp_isoc_ep_comp;
+	struct usb_endpoint_descriptor			desc;
+	struct usb_ss_ep_comp_descriptor		ss_ep_comp;
+	struct usb_ssp_isoc_ep_comp_descriptor		ssp_isoc_ep_comp;
+	struct usb_eusb2_isoc_ep_comp_descriptor	eusb2_isoc_ep_comp;
 	struct list_head		urb_list;
 	void				*hcpriv;
 	struct ep_device		*ep_dev;	/* For sysfs info */
--
cgit v1.2.3

From ac9c5170a18162d45c6edd1f0fa2d2b2504bc2cb Mon Sep 17 00:00:00 2001
From: Subramanian Mohan
Date: Wed, 19 Feb 2025 09:36:15 +0530
Subject: pps: generators: replace copy of pps-gen info struct with const pointer

Some PPS generator drivers may need to retrieve a pointer to their internal data while executing the PPS generator enable() method. During driver registration the pps_gen_device pointer is returned from the framework, and for that reason it is difficult to get the generator driver data back in the enable function. We cannot use the container_of() macro, since it results in a static assert, and we might otherwise end up using a static pointer.

To solve the issue and get the generator driver data back, we should not copy the struct pps_gen_source_info within the struct pps_gen_device during the registration stage, but simply save the pointer of the driver one. In this manner, drivers may get a pointer to their internal data as shown below:

struct pps_gen_foo_data_s {
	...
	struct pps_gen_source_info gen_info;
	struct pps_gen_device *pps_gen;
	...
};

static int __init pps_gen_foo_init(void)
{
	struct pps_gen_foo_data_s *foo;
	...
	foo->pps_gen = pps_gen_register_source(&foo->gen_info);
	...
}

Then, in the enable() method, we can retrieve the pointer to the main struct by using the code below:

static int pps_gen_foo_enable(struct pps_gen_device *pps_gen, bool enable)
{
	struct pps_gen_foo_data_s *foo = container_of(pps_gen->info,
					struct pps_gen_foo_data_s, gen_info);
	...
}

Signed-off-by: Rodolfo Giometti
Tested-by: Subramanian Mohan
Suggested-by: Andy Shevchenko
Reviewed-by: Andy Shevchenko
Signed-off-by: Subramanian Mohan
Link: https://lore.kernel.org/r/20250219040618.70962-2-subramanian.mohan@intel.com
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/pps_gen_kernel.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pps_gen_kernel.h b/include/linux/pps_gen_kernel.h
index 022ea0ac4440..6214c8aa2e02 100644
--- a/include/linux/pps_gen_kernel.h
+++ b/include/linux/pps_gen_kernel.h
@@ -43,7 +43,7 @@ struct pps_gen_source_info {

 /* The main struct */
 struct pps_gen_device {
-	struct pps_gen_source_info info;	/* PSS generator info */
+	const struct pps_gen_source_info *info;	/* PSS generator info */
 	bool enabled;				/* PSS generator status */

 	unsigned int event;
@@ -70,7 +70,7 @@ extern const struct attribute_group *pps_gen_groups[];
 */

 extern struct pps_gen_device *pps_gen_register_source(
-		struct pps_gen_source_info *info);
+		const struct pps_gen_source_info *info);
 extern void pps_gen_unregister_source(struct pps_gen_device *pps_gen);
 extern void pps_gen_event(struct pps_gen_device *pps_gen,
 		unsigned int event, void *data);
--
cgit v1.2.3

From 8510edf191d2df0822ea22d6226e4eef87562271 Mon Sep 17 00:00:00 2001
From: Jingbo Xu
Date: Tue, 18 Feb 2025 20:02:08 +0800
Subject: mm/filemap: fix miscalculated file range for filemap_fdatawrite_range_kick()

iocb->ki_pos has been updated with the number of written bytes since generic_perform_write(). Besides, __filemap_fdatawrite_range() accepts the inclusive end of the data range.

Fixes: 1d4457576570 ("mm: call filemap_fdatawrite_range_kick() after IOCB_DONTCACHE issue")
Signed-off-by: Jingbo Xu
Link: https://lore.kernel.org/r/20250218120209.88093-2-jefflexu@linux.alibaba.com
Reviewed-by: Jens Axboe
Signed-off-by: Christian Brauner
---
 include/linux/fs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9346adf28f7b..2788df98080f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2975,8 +2975,8 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
 	} else if (iocb->ki_flags & IOCB_DONTCACHE) {
 		struct address_space *mapping = iocb->ki_filp->f_mapping;

-		filemap_fdatawrite_range_kick(mapping, iocb->ki_pos,
-					      iocb->ki_pos + count);
+		filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count,
+					      iocb->ki_pos - 1);
 	}

 	return count;
--
cgit v1.2.3

From 8aeacf257070469ff78a998a968a61d0cadc0de3 Mon Sep 17 00:00:00 2001
From: Joel Granados
Date: Tue, 18 Feb 2025 10:56:21 +0100
Subject: perf/core: Move perf_event sysctls into kernel/events

Move ctl tables to two files:

- perf_event_{paranoid,mlock_kb,max_sample_rate} and perf_cpu_time_max_percent into kernel/events/core.c
- perf_event_max_{stack,context_per_stack} into kernel/events/callchain.c

Make static the variables and functions that are fully contained in core.c and callchain.c, and remove them from include/linux/perf_event.h. Additionally, six_hundred_forty_kb is moved to callchain.c.

Two new sysctl tables are added ({callchain,events_core}_sysctl_table) with their respective sysctl registration functions.

This is part of a greater effort to move ctl tables into their respective subsystems, which will reduce the merge conflicts in kernel/sysctl.c.
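The general shape of such a move (illustrative only, not the exact tables added by this commit) is a file-local table plus an initcall-time registration:

static struct ctl_table events_core_sysctl_table[] = {
	{
		.procname	= "perf_event_paranoid",
		.data		= &sysctl_perf_event_paranoid,
		.maxlen		= sizeof(sysctl_perf_event_paranoid),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};

static int __init init_events_core_sysctls(void)
{
	register_sysctl_init("kernel", events_core_sysctl_table);
	return 0;
}
core_initcall(init_events_core_sysctls);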
Signed-off-by: Joel Granados
Signed-off-by: Ingo Molnar
Acked-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20250218-jag-mv_ctltables-v1-5-cd3698ab8d29@kernel.org
---
 include/linux/perf_event.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2d07bc1193f3..c4525bae2fe9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1652,19 +1652,10 @@ static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64
 }

 extern int sysctl_perf_event_paranoid;
-extern int sysctl_perf_event_mlock;
 extern int sysctl_perf_event_sample_rate;
-extern int sysctl_perf_cpu_time_max_percent;

 extern void perf_sample_event_took(u64 sample_len_ns);

-int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos);
-int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos);
-int perf_event_max_stack_handler(const struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos);
-
 /* Access to perf_event_open(2) syscall. */
 #define PERF_SECURITY_OPEN		0

--
cgit v1.2.3

From 1f7df3a691740a7736bbc99dc4ed536120eb4746 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Wed, 19 Feb 2025 17:31:36 -0800
Subject: genirq/msi: Store the IOMMU IOVA directly in msi_desc instead of iommu_cookie

The IOMMU translation for MSI message addresses has been a 2-step process, separated in time:

1) iommu_dma_prepare_msi(): A cookie pointer containing the IOVA address is stored in the MSI descriptor when an MSI interrupt is allocated.

2) iommu_dma_compose_msi_msg(): this cookie pointer is used to compute a translated message address.

This has an inherent lifetime problem for the pointer stored in the cookie that must remain valid between the two steps. However, there is no locking at the irq layer that helps protect the lifetime. Today, this works under the assumption that the iommu domain is not changed while MSI interrupts are being programmed. This is true for normal DMA API users within the kernel, as the iommu domain is attached before the driver is probed and cannot be changed while a driver is attached.

Classic VFIO type1 also prevented changing the iommu domain while VFIO was running as it does not support changing the "container" after starting up.

However, iommufd has improved this so that the iommu domain can be changed during VFIO operation. This potentially allows userspace to directly race VFIO_DEVICE_ATTACH_IOMMUFD_PT (which calls iommu_attach_group()) and VFIO_DEVICE_SET_IRQS (which calls into iommu_dma_compose_msi_msg()).

This potentially causes both the cookie pointer and the unlocked call to iommu_get_domain_for_dev() on the MSI translation path to become UAFs.

Fix the MSI cookie UAF by removing the cookie pointer. The translated IOVA address is already known during iommu_dma_prepare_msi() and cannot change. Thus, it can simply be stored as an integer in the MSI descriptor.

The other UAF related to iommu_get_domain_for_dev() will be addressed in patch "iommu: Make iommu_dma_prepare_msi() into a generic operation" by using the IOMMU group mutex.
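A compressed illustration of the new scheme, using the helper from the diff below (the 12-bit shift is a made-up example): the MSI layer now stores plain integers, so there is no pointer left to go stale between the two steps:

	/* step 1, at MSI allocation time: remember the translated base */
	msi_desc_set_iommu_msi_iova(desc, msi_iova, 12);	/* stores msi_iova >> 12 */

	/* step 2, at message composition time: the address is rebuilt from
	 * the stored IOVA plus the low 12 bits of the physical doorbell,
	 * with no dereference of domain-lifetime data */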
Link: https://patch.msgid.link/r/a4f2cd76b9dc1833ee6c1cf325cba57def22231c.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Thomas Gleixner Signed-off-by: Jason Gunthorpe --- include/linux/msi.h | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index b10093c4d00e..fc4f3774c3af 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -166,6 +166,10 @@ struct msi_desc_data { * @dev: Pointer to the device which uses this descriptor * @msg: The last set MSI message cached for reuse * @affinity: Optional pointer to a cpu affinity mask for this descriptor + * @iommu_msi_iova: Optional shifted IOVA from the IOMMU to override the msi_addr. + * Only used if iommu_msi_shift != 0 + * @iommu_msi_shift: Indicates how many bits of the original address should be + * preserved when using iommu_msi_iova. * @sysfs_attr: Pointer to sysfs device attribute * * @write_msi_msg: Callback that may be called when the MSI message @@ -184,7 +188,8 @@ struct msi_desc { struct msi_msg msg; struct irq_affinity_desc *affinity; #ifdef CONFIG_IRQ_MSI_IOMMU - const void *iommu_cookie; + u64 iommu_msi_iova : 58; + u64 iommu_msi_shift : 6; #endif #ifdef CONFIG_SYSFS struct device_attribute *sysfs_attrs; @@ -285,28 +290,14 @@ struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid, #define msi_desc_to_dev(desc) ((desc)->dev) -#ifdef CONFIG_IRQ_MSI_IOMMU -static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) -{ - return desc->iommu_cookie; -} - -static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, - const void *iommu_cookie) -{ - desc->iommu_cookie = iommu_cookie; -} -#else -static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) +static inline void msi_desc_set_iommu_msi_iova(struct msi_desc *desc, u64 msi_iova, + unsigned int msi_shift) { - return NULL; -} - -static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, - const void *iommu_cookie) -{ -} +#ifdef CONFIG_IRQ_MSI_IOMMU + desc->iommu_msi_iova = msi_iova >> msi_shift; + desc->iommu_msi_shift = msi_shift; #endif +} int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid, struct msi_desc *init_desc); -- cgit v1.2.3 From 9349887e93009331e751854843f73a086bef4018 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:37 -0800 Subject: genirq/msi: Refactor iommu_dma_compose_msi_msg() The two-step process to translate the MSI address involves two functions, iommu_dma_prepare_msi() and iommu_dma_compose_msi_msg(). Previously iommu_dma_compose_msi_msg() needed to be in the iommu layer as it had to dereference the opaque cookie pointer. Now, the previous patch changed the cookie pointer into an integer so there is no longer any need for the iommu layer to be involved. Further, the call sites of iommu_dma_compose_msi_msg() all follow the same pattern of setting an MSI message address_hi/lo to non-translated and then immediately calling iommu_dma_compose_msi_msg(). Refactor iommu_dma_compose_msi_msg() into msi_msg_set_addr() that directly accepts the u64 version of the address and simplifies all the callers. Move the new helper to linux/msi.h since it has nothing to do with iommu. Aside from refactoring, this logically prepares for the next patch, which allows multiple implementation options for iommu_dma_prepare_msi(). 
So, it does not make sense to have the iommu_dma_compose_msi_msg() in dma-iommu.c as it no longer provides the only iommu_dma_prepare_msi() implementation. Link: https://patch.msgid.link/r/eda62a9bafa825e9cdabd7ddc61ad5a21c32af24.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Thomas Gleixner Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 6 ------ include/linux/msi.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 38c65e92ecd0..caee952febd4 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1508,7 +1508,6 @@ static inline void iommu_debugfs_setup(void) {} int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); -void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg); #else /* CONFIG_IOMMU_DMA */ @@ -1524,11 +1523,6 @@ static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_a { return 0; } - -static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) -{ -} - #endif /* CONFIG_IOMMU_DMA */ /* diff --git a/include/linux/msi.h b/include/linux/msi.h index fc4f3774c3af..8d97b890faec 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -299,6 +299,34 @@ static inline void msi_desc_set_iommu_msi_iova(struct msi_desc *desc, u64 msi_io #endif } +/** + * msi_msg_set_addr() - Set MSI address in an MSI message + * + * @desc: MSI descriptor that may carry an IOVA base address for MSI via @iommu_msi_iova/shift + * @msg: Target MSI message to set its address_hi and address_lo + * @msi_addr: Physical address to set the MSI message + * + * Notes: + * - Override @msi_addr using the IOVA base address in the @desc if @iommu_msi_shift is set + * - Otherwise, simply set @msi_addr to @msg + */ +static inline void msi_msg_set_addr(struct msi_desc *desc, struct msi_msg *msg, + phys_addr_t msi_addr) +{ +#ifdef CONFIG_IRQ_MSI_IOMMU + if (desc->iommu_msi_shift) { + u64 msi_iova = desc->iommu_msi_iova << desc->iommu_msi_shift; + + msg->address_hi = upper_32_bits(msi_iova); + msg->address_lo = lower_32_bits(msi_iova) | + (msi_addr & ((1 << desc->iommu_msi_shift) - 1)); + return; + } +#endif + msg->address_hi = upper_32_bits(msi_addr); + msg->address_lo = lower_32_bits(msi_addr); +} + int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid, struct msi_desc *init_desc); /** -- cgit v1.2.3 From 288683c92b1abc32277c83819bea287af614d239 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:38 -0800 Subject: iommu: Make iommu_dma_prepare_msi() into a generic operation SW_MSI supports IOMMU to translate an MSI message before the MSI message is delivered to the interrupt controller. On such systems, an iommu_domain must have a translation for the MSI message for interrupts to work. The IRQ subsystem will call into IOMMU to request that a physical page be set up to receive MSI messages, and the IOMMU then sets an IOVA that maps to that physical page. Ultimately the IOVA is programmed into the device via the msi_msg. Generalize this by allowing iommu_domain owners to provide implementations of this mapping. Add a function pointer in struct iommu_domain to allow a domain owner to provide its own implementation. Have dma-iommu supply its implementation for IOMMU_DOMAIN_DMA types during the iommu_get_dma_cookie() path. 
For IOMMU_DOMAIN_UNMANAGED types used by VFIO (and iommufd for now), have the same iommu_dma_sw_msi set as well in the iommu_get_msi_cookie() path. Hold the group mutex while in iommu_dma_prepare_msi() to ensure the domain doesn't change or become freed while running. Races with IRQ operations from VFIO and domain changes from iommufd are possible here. Replace the msi_prepare_lock with a lockdep assertion for the group mutex as documentation. In dma-iommu.c, each iommu_domain is unique to a group. Link: https://patch.msgid.link/r/4ca696150d2baee03af27c4ddefdb7b0b0280e7b.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 44 ++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index caee952febd4..761c5e186de9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -44,6 +44,8 @@ struct iommu_dma_cookie; struct iommu_fault_param; struct iommufd_ctx; struct iommufd_viommu; +struct msi_desc; +struct msi_msg; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -216,6 +218,12 @@ struct iommu_domain { struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; int (*iopf_handler)(struct iopf_group *group); + +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) + int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); +#endif + void *fault_data; union { struct { @@ -234,6 +242,16 @@ struct iommu_domain { }; }; +static inline void iommu_domain_set_sw_msi( + struct iommu_domain *domain, + int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr)) +{ +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) + domain->sw_msi = sw_msi; +#endif +} + static inline bool iommu_is_dma_domain(struct iommu_domain *domain) { return domain->type & __IOMMU_DOMAIN_DMA_API; @@ -1470,6 +1488,18 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) static inline void iommu_free_global_pasid(ioasid_t pasid) {} #endif /* CONFIG_IOMMU_API */ +#ifdef CONFIG_IRQ_MSI_IOMMU +#ifdef CONFIG_IOMMU_API +int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); +#else +static inline int iommu_dma_prepare_msi(struct msi_desc *desc, + phys_addr_t msi_addr) +{ + return 0; +} +#endif /* CONFIG_IOMMU_API */ +#endif /* CONFIG_IRQ_MSI_IOMMU */ + #if IS_ENABLED(CONFIG_LOCKDEP) && IS_ENABLED(CONFIG_IOMMU_API) void iommu_group_mutex_assert(struct device *dev); #else @@ -1503,26 +1533,12 @@ static inline void iommu_debugfs_setup(void) {} #endif #ifdef CONFIG_IOMMU_DMA -#include - int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); - -int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); - #else /* CONFIG_IOMMU_DMA */ - -struct msi_desc; -struct msi_msg; - static inline int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) { return -ENODEV; } - -static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) -{ - return 0; -} #endif /* CONFIG_IOMMU_DMA */ /* -- cgit v1.2.3 From 748706d7ca06012621b32851b68136bf33613b9e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 19 Feb 2025 17:31:40 -0800 Subject: iommu: Turn fault_data into an iommufd private pointer A "fault_data" was added exclusively for the iommufd_fault_iopf_handler() used by IOPF/PRI use cases, along with the attach_handle.
Now, the iommufd version of the sw_msi function will reuse the attach_handle and fault_data for a non-fault case. Rename "fault_data" to "iommufd_hwpt" so as not to confine it to a "fault" case. Move it into a union to be the iommufd private pointer. A following patch will move the iova_cookie to the union for dma-iommu too after the iommufd_sw_msi implementation is added. Since we have two unions now, add some simple comments for readability. Link: https://patch.msgid.link/r/ee5039503f28a16590916e9eef28b917e2d1607a.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 761c5e186de9..e93d2e918599 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -224,8 +224,10 @@ struct iommu_domain { phys_addr_t msi_addr); #endif - void *fault_data; - union { + union { /* Pointer usable by owner of the domain */ + struct iommufd_hw_pagetable *iommufd_hwpt; /* iommufd */ + }; + union { /* Fault handler */ struct { iommu_fault_handler_t handler; void *handler_token; -- cgit v1.2.3 From 46006ceb5d029f92df405db15c9a31f0ee41628c Mon Sep 17 00:00:00 2001 From: Linu Cherian Date: Wed, 12 Feb 2025 17:19:13 +0530 Subject: coresight: core: Add provision for panic callbacks Panic callback handlers allow coresight device drivers to sync relevant trace data and trace metadata to reserved memory regions so that they can be retrieved later in the subsequent boot or in the crashdump kernel. Signed-off-by: Linu Cherian Reviewed-by: James Clark Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250212114918.548431-4-lcherian@marvell.com --- include/linux/coresight.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 17276965ff1d..3eef4e91df3f 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -340,6 +340,7 @@ enum cs_mode { #define link_ops(csdev) csdev->ops->link_ops #define helper_ops(csdev) csdev->ops->helper_ops #define ect_ops(csdev) csdev->ops->ect_ops +#define panic_ops(csdev) csdev->ops->panic_ops /** * struct coresight_ops_sink - basic operations for a sink @@ -409,11 +410,22 @@ struct coresight_ops_helper { int (*disable)(struct coresight_device *csdev, void *data); }; + +/** + * struct coresight_ops_panic - Generic device ops for panic handling + * + * @sync : Sync the device register state/trace data + */ +struct coresight_ops_panic { + int (*sync)(struct coresight_device *csdev); +}; + struct coresight_ops { const struct coresight_ops_sink *sink_ops; const struct coresight_ops_link *link_ops; const struct coresight_ops_source *source_ops; const struct coresight_ops_helper *helper_ops; + const struct coresight_ops_panic *panic_ops; }; static inline u32 csdev_access_relaxed_read32(struct csdev_access *csa, -- cgit v1.2.3 From 69c7be1b903fca2835e80ec506bd1d75ce84fb4d Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Wed, 19 Feb 2025 20:50:28 +0800 Subject: rtnetlink: Pack newlink() params into struct There are 4 net namespaces involved when creating links: - source netns - where the netlink socket resides, - target netns - where to put the device being created, - link netns - netns associated with the device (backend), - peer netns - netns of peer device.
Currently, two nets are passed to the newlink() callback - the "src_net" parameter and "dev_net" (implicitly in net_device). They are set as follows, depending on netlink attributes in the request.
+------------+-------------------+---------+---------+
| peer netns | IFLA_LINK_NETNSID | src_net | dev_net |
+------------+-------------------+---------+---------+
|            | absent            | source  | target  |
|   absent   +-------------------+---------+---------+
|            | present           | link    | link    |
+------------+-------------------+---------+---------+
|            | absent            | peer    | target  |
|  present   +-------------------+---------+---------+
|            | present           | peer    | link    |
+------------+-------------------+---------+---------+
When IFLA_LINK_NETNSID is present, the device is created in the link netns first and then moved to the target netns. This has some side effects, including extra ifindex allocation, ifname validation and link events. These could be avoided if we create it in the target netns from the beginning. On the other hand, the meaning of the src_net parameter is ambiguous. It varies depending on how parameters are passed. It is the effective link (or peer) netns by design, but some drivers ignore it and use dev_net instead. To provide more netns context for drivers, this patch packs the existing newlink() parameters, along with the source netns, link netns and peer netns, into a struct. The old "src_net" is renamed to "net" to avoid confusion with the real source netns, and will be deprecated later. The uses of src_net are converted to params->net trivially. Signed-off-by: Xiao Liang Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250219125039.18024-3-shaw.leon@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/if_macvlan.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 523025106a64..0f7281e3e448 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -59,8 +59,10 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan, extern void macvlan_common_setup(struct net_device *dev); -extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +struct rtnl_newlink_params; + +extern int macvlan_common_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack); extern void macvlan_dellink(struct net_device *dev, struct list_head *head); -- cgit v1.2.3 From 79c731e20d7401cc39e4f0e633b135b966a04f62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 7 Feb 2025 18:18:35 +0200 Subject: PCI: Track Flit Mode Status & print it with link status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCIe r6.0 added Flit mode, which mainly alters HW behavior, but there are some OS-visible changes. The OS-visible changes include differences in the layout of some capabilities and interpretation of the TLP headers (in diagnostic situations). To be able to determine which mode the PCIe Link is using, store the Flit Mode Status (PCIe r6.1 sec 7.5.3.20) information in addition to the Link speed into struct pci_bus in pcie_update_link_speed().
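For illustration, once pcie_update_link_speed() has filled in the new bit, any code with a struct pci_dev at hand can report the mode. A sketch (hypothetical helper, not part of the patch):

	static void example_report_flit_mode(struct pci_dev *dev)
	{
		pci_info(dev, "PCIe link in %s mode\n",
			 dev->bus->flit_mode ? "Flit" : "non-Flit");
	}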
Link: https://lore.kernel.org/r/20250207161836.2755-2-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen [bhelgaas: use unsigned int:1 instead of bool, update flit_mode setting] Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 47b31ad724fa..e4bf67bf8172 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -681,6 +681,7 @@ struct pci_bus { struct bin_attribute *legacy_mem; /* Legacy mem */ unsigned int is_added:1; unsigned int unsafe_warn:1; /* warned about RW1C config write */ + unsigned int flit_mode:1; /* Link in Flit mode */ }; #define to_pci_bus(n) container_of(n, struct pci_bus, dev) -- cgit v1.2.3 From 7e077e6707b3428562ba30d883ff8f54e98dc18b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 7 Feb 2025 18:18:36 +0200 Subject: PCI/ERR: Handle TLP Log in Flit mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flit mode introduced in PCIe r6.0 alters how the TLP Header Log is presented through AER and DPC Capability registers. The TLP Prefix Log Register is not present with Flit mode, and the register becomes an extension of the TLP Header Log (PCIe r6.1 secs 7.8.4.12 & 7.9.14.13). Adapt pcie_read_tlp_log() and struct pcie_tlp_log to read and store the extended TLP Header Log when the Link is in Flit mode. As the Prefix Log and Extended TLP Header are not present at the same time, a C union can be used. Determining whether the error occurred while the Link was in Flit mode is a bit complicated. In case of AER, the Advanced Error Capabilities and Control Register directly tells whether the error was logged in Flit mode or not (PCIe r6.1 sec 7.8.4.7). The DPC Capability (PCIe r6.1 sec 7.9.14), unfortunately, does not contain the same information. Unlike AER, the DPC Capability does not provide a way to discern whether the error was logged in Flit mode (this is confirmed by PCI WG to be an oversight in the spec). DPC will bring the Link down immediately following an error, which makes it impossible to acquire the Flit Mode Status directly from the Link Status 2 register because Flit Mode Status is only set in certain Link states (PCIe r6.1 sec 7.5.3.20). As a workaround, use the flit_mode value stored into the struct pci_bus. 
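For illustration, a diagnostic consumer of the extended log would walk header_len DWORDs and touch the prefix log only outside Flit mode, since the union overlays the two. A sketch assuming an already-populated struct pcie_tlp_log:

	static void example_dump_tlp_log(const struct pcie_tlp_log *log)
	{
		unsigned int i;

		for (i = 0; i < log->header_len; i++)
			pr_err("TLP Header DW%u: %#010x\n", i, log->dw[i]);

		if (!log->flit)	/* no TLP Prefix Log exists in Flit mode */
			pr_err("TLP Prefix: %#010x\n", log->prefix[0]);
	}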
Link: https://lore.kernel.org/r/20250207161836.2755-3-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas --- include/linux/aer.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/aer.h b/include/linux/aer.h index 947b63091902..02940be66324 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -22,12 +22,20 @@ */ #define PCIE_STD_NUM_TLP_HEADERLOG 4 #define PCIE_STD_MAX_TLP_PREFIXLOG 4 +#define PCIE_STD_MAX_TLP_HEADERLOG (PCIE_STD_NUM_TLP_HEADERLOG + 10) struct pci_dev; struct pcie_tlp_log { - u32 dw[PCIE_STD_NUM_TLP_HEADERLOG]; - u32 prefix[PCIE_STD_MAX_TLP_PREFIXLOG]; + union { + u32 dw[PCIE_STD_MAX_TLP_HEADERLOG]; + struct { + u32 _do_not_use[PCIE_STD_NUM_TLP_HEADERLOG]; + u32 prefix[PCIE_STD_MAX_TLP_PREFIXLOG]; + }; + }; + u8 header_len; /* Length of the Logged TLP Header in DWORDs */ + bool flit; /* TLP was logged when in Flit mode */ }; struct aer_capability_regs { -- cgit v1.2.3 From dcc35baae732b9079b2c6595cfd86da02b34a4e6 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Fri, 21 Feb 2025 08:56:57 +0800 Subject: usb: Add base USB MCTP definitions Upcoming changes will add a USB host (and later gadget) driver for the MCTP-over-USB protocol. Add a header that provides common definitions for protocol support: the packet header format and a few framing definitions. Add a define for the MCTP class code, as per https://usb.org/defined-class-codes. Signed-off-by: Jeremy Kerr Acked-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20250221-dev-mctp-usb-v3-1-3353030fe9cc@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- include/linux/usb/mctp-usb.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 include/linux/usb/mctp-usb.h (limited to 'include/linux') diff --git a/include/linux/usb/mctp-usb.h b/include/linux/usb/mctp-usb.h new file mode 100644 index 000000000000..a2f6f1e04efb --- /dev/null +++ b/include/linux/usb/mctp-usb.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * mctp-usb.h - MCTP USB transport binding: common definitions, + * based on DMTF0283 specification: + * https://www.dmtf.org/sites/default/files/standards/documents/DSP0283_1.0.1.pdf + * + * These are protocol-level definitions, that may be shared between host + * and gadget drivers. + * + * Copyright (C) 2024-2025 Code Construct Pty Ltd + */ + +#ifndef __LINUX_USB_MCTP_USB_H +#define __LINUX_USB_MCTP_USB_H + +#include + +struct mctp_usb_hdr { + __be16 id; + u8 rsvd; + u8 len; +} __packed; + +#define MCTP_USB_XFER_SIZE 512 +#define MCTP_USB_BTU 68 +#define MCTP_USB_MTU_MIN MCTP_USB_BTU +#define MCTP_USB_MTU_MAX (U8_MAX - sizeof(struct mctp_usb_hdr)) +#define MCTP_USB_DMTF_ID 0x1ab4 + +#endif /* __LINUX_USB_MCTP_USB_H */ -- cgit v1.2.3 From f2ffe5a9183d22eec718edac03e8bfcedf4dee70 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 16 Feb 2025 11:07:17 +0800 Subject: crypto: hash - Add request chaining API This adds request chaining to the ahash interface. Request chaining allows multiple requests to be submitted in one shot. An algorithm can elect to receive chained requests by setting the flag CRYPTO_ALG_REQ_CHAIN. If this bit is not set, the API will break up chained requests and submit them one-by-one. A new err field is added to struct crypto_async_request to record the return value for each individual request. 
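For illustration, a caller could build a chain with the helpers below; submission and completion handling are omitted, and the ahash algorithm is assumed to advertise CRYPTO_ALG_REQ_CHAIN (a sketch with hypothetical naming):

	static void example_chain_requests(struct ahash_request *first,
					   struct ahash_request *second)
	{
		crypto_reqchain_init(&first->base);
		crypto_request_chain(&second->base, &first->base);
		/* submit @first once; each request's base.err carries its result */
	}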
Signed-off-by: Herbert Xu --- include/linux/crypto.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index b164da5e129e..1d2a6c515d58 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -13,6 +13,8 @@ #define _LINUX_CRYPTO_H #include +#include +#include #include #include #include @@ -124,6 +126,9 @@ */ #define CRYPTO_ALG_FIPS_INTERNAL 0x00020000 +/* Set if the algorithm supports request chains. */ +#define CRYPTO_ALG_REQ_CHAIN 0x00040000 + /* * Transform masks and values (for crt_flags). */ @@ -174,6 +179,7 @@ struct crypto_async_request { struct crypto_tfm *tfm; u32 flags; + int err; }; /** @@ -540,5 +546,23 @@ int crypto_comp_decompress(struct crypto_comp *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen); +static inline void crypto_reqchain_init(struct crypto_async_request *req) +{ + req->err = -EINPROGRESS; + INIT_LIST_HEAD(&req->list); +} + +static inline void crypto_request_chain(struct crypto_async_request *req, + struct crypto_async_request *head) +{ + req->err = -EINPROGRESS; + list_add_tail(&req->list, &head->list); +} + +static inline bool crypto_tfm_is_async(struct crypto_tfm *tfm) +{ + return tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC; +} + #endif /* _LINUX_CRYPTO_H */ -- cgit v1.2.3 From 439963cdc3aa447dfd3b5abc05fcd25cef4d22dc Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 16 Feb 2025 11:07:22 +0800 Subject: crypto: ahash - Add virtual address support This patch adds virtual address support to ahash. Virtual addresses were previously only supported through shash. The user may choose to use virtual addresses with ahash by calling ahash_request_set_virt instead of ahash_request_set_crypt. The API will take care of translating this to an SG list if necessary, unless the algorithm declares that it supports chaining. Therefore in order for an ahash algorithm to support chaining, it must also support virtual addresses directly. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 1d2a6c515d58..61ac11226638 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -126,7 +126,7 @@ */ #define CRYPTO_ALG_FIPS_INTERNAL 0x00020000 -/* Set if the algorithm supports request chains. */ +/* Set if the algorithm supports request chains and virtual addresses. */ #define CRYPTO_ALG_REQ_CHAIN 0x00040000 /* -- cgit v1.2.3 From 3b29bcee8f6f703a5952b85fc2ffcbcfb0862db4 Mon Sep 17 00:00:00 2001 From: Robert Budai Date: Mon, 17 Feb 2025 12:57:45 +0200 Subject: iio: imu: adis: Add custom ops struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces a custom ops struct letting users define custom read and write functions. Some adis devices might define a completely different spi protocol from the one used in the default implementation. 
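For example, a driver speaking a non-standard SPI framing could supply its own table; the my_adis_* names are hypothetical and the transfer bodies are stubs:

	static int my_adis_read(struct adis *adis, unsigned int reg,
				unsigned int *value, unsigned int size)
	{
		/* build the device-specific SPI read transfer here */
		*value = 0;
		return 0;
	}

	static int my_adis_write(struct adis *adis, unsigned int reg,
				 unsigned int value, unsigned int size)
	{
		/* build the device-specific SPI write transfer here */
		return 0;
	}

	static const struct adis_ops my_adis_ops = {
		.read = my_adis_read,
		.write = my_adis_write,
	};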
Co-developed-by: Ramona Gradinariu Signed-off-by: Ramona Gradinariu Co-developed-by: Antoniu Miclaus Signed-off-by: Antoniu Miclaus Co-developed-by: Nuno Sá Signed-off-by: Nuno Sá Signed-off-by: Robert Budai Link: https://patch.msgid.link/20250217105753.605465-2-robert.budai@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 4bb98d9731de..89cfa75ae9ea 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -94,6 +94,18 @@ struct adis_data { unsigned int burst_max_speed_hz; }; +/** + * struct adis_ops: Custom ops for adis devices. + * @write: Custom spi write implementation. + * @read: Custom spi read implementation. + */ +struct adis_ops { + int (*write)(struct adis *adis, unsigned int reg, unsigned int value, + unsigned int size); + int (*read)(struct adis *adis, unsigned int reg, unsigned int *value, + unsigned int size); +}; + /** * struct adis - ADIS device instance data * @spi: Reference to SPI device which owns this ADIS IIO device @@ -101,6 +113,7 @@ struct adis_data { * @data: ADIS chip variant specific data * @burst_extra_len: Burst extra length. Should only be used by devices that can * dynamically change their burst mode length. + * @ops: ops struct for custom read and write functions * @state_lock: Lock used by the device to protect state * @msg: SPI message object * @xfer: SPI transfer objects to be used for a @msg @@ -116,6 +129,7 @@ struct adis { const struct adis_data *data; unsigned int burst_extra_len; + const struct adis_ops *ops; /** * The state_lock is meant to be used during operations that require * a sequence of SPI R/W in order to protect the SPI transfer @@ -168,7 +182,7 @@ int __adis_read_reg(struct adis *adis, unsigned int reg, static inline int __adis_write_reg_8(struct adis *adis, unsigned int reg, u8 val) { - return __adis_write_reg(adis, reg, val, 1); + return adis->ops->write(adis, reg, val, 1); } /** @@ -180,7 +194,7 @@ static inline int __adis_write_reg_8(struct adis *adis, unsigned int reg, static inline int __adis_write_reg_16(struct adis *adis, unsigned int reg, u16 val) { - return __adis_write_reg(adis, reg, val, 2); + return adis->ops->write(adis, reg, val, 2); } /** @@ -192,7 +206,7 @@ static inline int __adis_write_reg_16(struct adis *adis, unsigned int reg, static inline int __adis_write_reg_32(struct adis *adis, unsigned int reg, u32 val) { - return __adis_write_reg(adis, reg, val, 4); + return adis->ops->write(adis, reg, val, 4); } /** @@ -207,7 +221,7 @@ static inline int __adis_read_reg_16(struct adis *adis, unsigned int reg, unsigned int tmp; int ret; - ret = __adis_read_reg(adis, reg, &tmp, 2); + ret = adis->ops->read(adis, reg, &tmp, 2); if (ret == 0) *val = tmp; @@ -226,7 +240,7 @@ static inline int __adis_read_reg_32(struct adis *adis, unsigned int reg, unsigned int tmp; int ret; - ret = __adis_read_reg(adis, reg, &tmp, 4); + ret = adis->ops->read(adis, reg, &tmp, 4); if (ret == 0) *val = tmp; @@ -244,7 +258,7 @@ static inline int adis_write_reg(struct adis *adis, unsigned int reg, unsigned int val, unsigned int size) { guard(mutex)(&adis->state_lock); - return __adis_write_reg(adis, reg, val, size); + return adis->ops->write(adis, reg, val, size); } /** @@ -258,7 +272,7 @@ static int adis_read_reg(struct adis *adis, unsigned int reg, unsigned int *val, unsigned int size) { 
guard(mutex)(&adis->state_lock); - return __adis_read_reg(adis, reg, val, size); + return adis->ops->read(adis, reg, val, size); } /** -- cgit v1.2.3 From 7f15d7a7d12dbcb4a846ca19d9565e0c11ea6694 Mon Sep 17 00:00:00 2001 From: Robert Budai Date: Mon, 17 Feb 2025 12:57:46 +0200 Subject: iio: imu: adis: Add reset to custom ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch allows the custom definition of reset functionality for adis object. It is useful in cases where the driver does not need to sleep after the reset since it is handled by the library. Co-developed-by: Ramona Gradinariu Signed-off-by: Ramona Gradinariu Co-developed-by: Antoniu Miclaus Signed-off-by: Antoniu Miclaus Co-developed-by: Nuno Sá Signed-off-by: Nuno Sá Signed-off-by: Robert Budai Link: https://patch.msgid.link/20250217105753.605465-3-robert.budai@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 89cfa75ae9ea..52652f51db2e 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -98,12 +98,15 @@ struct adis_data { * struct adis_ops: Custom ops for adis devices. * @write: Custom spi write implementation. * @read: Custom spi read implementation. + * @reset: Custom sw reset implementation. The custom implementation does not + * need to sleep after the reset. It's done by the library already. */ struct adis_ops { int (*write)(struct adis *adis, unsigned int reg, unsigned int value, unsigned int size); int (*read)(struct adis *adis, unsigned int reg, unsigned int *value, unsigned int size); + int (*reset)(struct adis *adis); }; /** -- cgit v1.2.3 From 9fa98d94131806cae0441d05213d36da480d7389 Mon Sep 17 00:00:00 2001 From: Robert Budai Date: Mon, 17 Feb 2025 12:57:47 +0200 Subject: iio: imu: adis: Add DIAG_STAT register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some devices may have more than 16 bits of status. This patch allows the user to specify the size of the DIAG_STAT register. It defaults to 2 if not specified. This is mainly for backward compatibility. Co-developed-by: Ramona Gradinariu Signed-off-by: Ramona Gradinariu Co-developed-by: Antoniu Miclaus Signed-off-by: Antoniu Miclaus Co-developed-by: Nuno Sá Signed-off-by: Nuno Sá Signed-off-by: Robert Budai Link: https://patch.msgid.link/20250217105753.605465-4-robert.budai@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index 52652f51db2e..aa160511e265 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -44,6 +44,8 @@ struct adis_timeout { * @glob_cmd_reg: Register address of the GLOB_CMD register * @msc_ctrl_reg: Register address of the MSC_CTRL register * @diag_stat_reg: Register address of the DIAG_STAT register + * @diag_stat_size: Length (in bytes) of the DIAG_STAT register. If 0 the + * default length is 2 bytes long. 
* @prod_id_reg: Register address of the PROD_ID register * @prod_id: Product ID code that should be expected when reading @prod_id_reg * @self_test_mask: Bitmask of supported self-test operations @@ -70,6 +72,7 @@ struct adis_data { unsigned int glob_cmd_reg; unsigned int msc_ctrl_reg; unsigned int diag_stat_reg; + unsigned int diag_stat_size; unsigned int prod_id_reg; unsigned int prod_id; -- cgit v1.2.3 From 531ca2b9a215d072ffb4b1ff760a73f5e80c9c46 Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Wed, 19 Feb 2025 10:58:07 +0200 Subject: net/mlx5: Add new health syndrome error and crr bit offset Add a new error value for trust lockdown in the health syndrome enum. Also, include the offset for the CRR bit in the health buffer layout. These changes prepare for downstream patches that update health event handling. Signed-off-by: Shahar Shitrit Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250219085808.349923-2-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 1 + include/linux/mlx5/mlx5_ifc.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0c48b20f818a..fd37f4e54d76 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -538,6 +538,7 @@ struct mlx5_cmd_layout { }; enum mlx5_rfr_severity_bit_offsets { + MLX5_CRR_BIT_OFFSET = 0x6, MLX5_RFR_BIT_OFFSET = 0x7, }; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4f3716e124c9..cc2875e843f7 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -11119,6 +11119,7 @@ enum { MLX5_INITIAL_SEG_HEALTH_SYNDROME_FFSER_ERR = 0xf, MLX5_INITIAL_SEG_HEALTH_SYNDROME_HIGH_TEMP_ERR = 0x10, MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_PCI_POISONED_ERR = 0x12, + MLX5_INITIAL_SEG_HEALTH_SYNDROME_TRUST_LOCKDOWN_ERR = 0x13, }; struct mlx5_ifc_initial_seg_bits { -- cgit v1.2.3 From 80df31f384b4146a62a01b3d4beb376cc7b9a89e Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 19 Feb 2025 10:58:08 +0200 Subject: net/mlx5: Change POOL_NEXT_SIZE define value and make it global Change the POOL_NEXT_SIZE define value from 0 to BIT(30), since this define is used to request the maximum available flow table size, and zero doesn't make sense for that. Some places in the driver use zero explicitly, expecting the smallest possible table size, but due to this define they unknowingly end up allocating the biggest table size instead. In addition, move the definition to "include/linux/mlx5/fs.h" to expose the define to the IB driver as well, while renaming it appropriately.
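To illustrate the semantics this fixes, consider a hypothetical size-selection helper: with the old define of 0, a caller passing zero while expecting the smallest table silently matched the "maximum" request, whereas BIT(30) keeps the two intents distinct (a sketch only):

	static u32 example_pick_table_size(u32 requested)
	{
		if (requested == MLX5_FS_MAX_POOL_SIZE)
			return U32_MAX;	/* stand-in for the device maximum */

		/* zero now really means "smallest", not "biggest" */
		return roundup_pow_of_two(requested ?: 1);
	}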
Signed-off-by: Patrisious Haddad Reviewed-by: Maor Gottlieb Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250219085808.349923-3-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 2a69d9d71276..01cb72d68c23 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -40,6 +40,8 @@ #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) +#define MLX5_FS_MAX_POOL_SIZE BIT(30) + enum mlx5_flow_destination_type { MLX5_FLOW_DESTINATION_TYPE_NONE, MLX5_FLOW_DESTINATION_TYPE_VPORT, -- cgit v1.2.3 From 97a705dc1a3654d8d2e466433a897be202a7f0ac Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 5 Feb 2025 11:25:22 +0000 Subject: cpufreq/amd-pstate: Use scope based cleanup for cpufreq_policy refs There have been instances in past where refcount decrementing is missed while exiting a function. Use automatic scope based cleanup to avoid such errors. Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20250205112523.201101-12-dhananjay.ugwekar@amd.com Signed-off-by: Mario Limonciello --- include/linux/cpufreq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 7fe0981a7e46..dde5212d256c 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -210,6 +210,9 @@ static inline struct cpufreq_policy *cpufreq_cpu_get(unsigned int cpu) static inline void cpufreq_cpu_put(struct cpufreq_policy *policy) { } #endif +/* Scope based cleanup macro for cpufreq_policy kobject reference counting */ +DEFINE_FREE(put_cpufreq_policy, struct cpufreq_policy *, if (_T) cpufreq_cpu_put(_T)) + static inline bool policy_is_inactive(struct cpufreq_policy *policy) { return cpumask_empty(policy->cpus); -- cgit v1.2.3 From 69920338f8130da929ade6f93e6fa3e0e68433ee Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 10 Feb 2025 11:51:56 +0100 Subject: gpiolib: sanitize the return value of gpio_chip::request() The return value of the request() callback may be propagated to user-space. If a bad driver returns a positive number, it may confuse user programs. Tighten the API contract and check for positive numbers returned by GPIO controllers. 
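Under the tightened contract, a compliant hook returns 0 on success or a negative errno, never a positive number. A sketch (hypothetical driver code; the validation shown is illustrative):

	static int example_gpio_request(struct gpio_chip *gc, unsigned int offset)
	{
		if (offset >= gc->ngpio)
			return -EINVAL;

		/* enable module power/clock here; a positive return is now a bug */
		return 0;
	}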
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250210-gpio-sanitize-retvals-v1-2-12ea88506cb2@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 10544f4a03e5..ce22c072337c 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -329,7 +329,8 @@ struct gpio_irq_chip { * @fwnode: optional fwnode providing this controller's properties * @owner: helps prevent removal of modules exporting active GPIOs * @request: optional hook for chip-specific activation, such as - * enabling module power and clock; may sleep + * enabling module power and clock; may sleep; must return 0 on success + * or negative error number on failure * @free: optional hook for chip-specific deactivation, such as * disabling module power and clock; may sleep * @get_direction: returns direction for signal "offset", 0=out, 1=in, -- cgit v1.2.3 From dcf8f3bffa2de2c7f3b5771b63605194ccd2286f Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 10 Feb 2025 11:51:57 +0100 Subject: gpiolib: sanitize the return value of gpio_chip::set_config() The return value of the set_config() callback may be propagated to user-space. If a bad driver returns a positive number, it may confuse user programs. Tighten the API contract and check for positive numbers returned by GPIO controllers. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250210-gpio-sanitize-retvals-v1-3-12ea88506cb2@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index ce22c072337c..f2145e938b29 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -349,7 +349,8 @@ struct gpio_irq_chip { * @set: assigns output value for signal "offset" * @set_multiple: assigns output values for multiple signals defined by "mask" * @set_config: optional hook for all kinds of settings. Uses the same - * packed config format as generic pinconf. + * packed config format as generic pinconf. Must return 0 on success and + * a negative error number on failure. * @to_irq: optional hook supporting non-static gpiod_to_irq() mappings; * implementation may not sleep * @dbg_show: optional routine to show contents in debugfs; default code -- cgit v1.2.3 From 18311a766c587fc69b1806f1d5943305903b7e6e Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Wed, 12 Feb 2025 11:55:02 +0530 Subject: err.h: move IOMEM_ERR_PTR() to err.h Since IOMEM_ERR_PTR() macro deals with an error pointer, a better place for it is err.h. This helps avoid dependency on io.h for the users that don't need it. Suggested-by: Andy Shevchenko Signed-off-by: Raag Jadav Acked-by: Arnd Bergmann Signed-off-by: Andy Shevchenko --- include/linux/err.h | 3 +++ include/linux/io.h | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/err.h b/include/linux/err.h index a4dacd745fcf..1d60aa86db53 100644 --- a/include/linux/err.h +++ b/include/linux/err.h @@ -44,6 +44,9 @@ static inline void * __must_check ERR_PTR(long error) /* Return the pointer in the percpu address space. */ #define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error)) +/* Cast an error pointer to __iomem. 
*/ +#define IOMEM_ERR_PTR(error) (__force void __iomem *)ERR_PTR(error) + /** * PTR_ERR - Extract the error code from an error pointer. * @ptr: An error pointer. diff --git a/include/linux/io.h b/include/linux/io.h index 59ec5eea696c..40cb2de73f5e 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -65,8 +65,6 @@ static inline void devm_ioport_unmap(struct device *dev, void __iomem *addr) } #endif -#define IOMEM_ERR_PTR(err) (__force void __iomem *)ERR_PTR(err) - void __iomem *devm_ioremap(struct device *dev, resource_size_t offset, resource_size_t size); void __iomem *devm_ioremap_uc(struct device *dev, resource_size_t offset, -- cgit v1.2.3 From a21cad9312767d26b5257ce0662699bb202cdda1 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 12 Feb 2025 11:55:03 +0530 Subject: driver core: Split devres APIs to device/devres.h device.h is a huge header which is hard to follow and easy to miss something. Improve that by splitting devres APIs to device/devres.h. In particular this helps to speedup the build of the code that includes device.h solely for a devres APIs. While at it, cast the error pointers to __iomem using IOMEM_ERR_PTR() and fix sparse warnings. Signed-off-by: Raag Jadav Acked-by: Arnd Bergmann Reviewed-by: Greg Kroah-Hartman Signed-off-by: Andy Shevchenko --- include/linux/device.h | 119 +--------------------------------------- include/linux/device/devres.h | 124 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 118 deletions(-) create mode 100644 include/linux/device/devres.h (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 80a5b3268986..78ca7fd0e625 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -26,9 +26,9 @@ #include #include #include -#include #include #include +#include #include #include #include @@ -281,123 +281,6 @@ int __must_check device_create_bin_file(struct device *dev, void device_remove_bin_file(struct device *dev, const struct bin_attribute *attr); -/* device resource management */ -typedef void (*dr_release_t)(struct device *dev, void *res); -typedef int (*dr_match_t)(struct device *dev, void *res, void *match_data); - -void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, - int nid, const char *name) __malloc; -#define devres_alloc(release, size, gfp) \ - __devres_alloc_node(release, size, gfp, NUMA_NO_NODE, #release) -#define devres_alloc_node(release, size, gfp, nid) \ - __devres_alloc_node(release, size, gfp, nid, #release) - -void devres_for_each_res(struct device *dev, dr_release_t release, - dr_match_t match, void *match_data, - void (*fn)(struct device *, void *, void *), - void *data); -void devres_free(void *res); -void devres_add(struct device *dev, void *res); -void *devres_find(struct device *dev, dr_release_t release, - dr_match_t match, void *match_data); -void *devres_get(struct device *dev, void *new_res, - dr_match_t match, void *match_data); -void *devres_remove(struct device *dev, dr_release_t release, - dr_match_t match, void *match_data); -int devres_destroy(struct device *dev, dr_release_t release, - dr_match_t match, void *match_data); -int devres_release(struct device *dev, dr_release_t release, - dr_match_t match, void *match_data); - -/* devres group */ -void * __must_check devres_open_group(struct device *dev, void *id, gfp_t gfp); -void devres_close_group(struct device *dev, void *id); -void devres_remove_group(struct device *dev, void *id); -int devres_release_group(struct device *dev, void *id); - -/* 
managed devm_k.alloc/kfree for device drivers */ -void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __alloc_size(2); -void *devm_krealloc(struct device *dev, void *ptr, size_t size, - gfp_t gfp) __must_check __realloc_size(3); -__printf(3, 0) char *devm_kvasprintf(struct device *dev, gfp_t gfp, - const char *fmt, va_list ap) __malloc; -__printf(3, 4) char *devm_kasprintf(struct device *dev, gfp_t gfp, - const char *fmt, ...) __malloc; -static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp) -{ - return devm_kmalloc(dev, size, gfp | __GFP_ZERO); -} -static inline void *devm_kmalloc_array(struct device *dev, - size_t n, size_t size, gfp_t flags) -{ - size_t bytes; - - if (unlikely(check_mul_overflow(n, size, &bytes))) - return NULL; - - return devm_kmalloc(dev, bytes, flags); -} -static inline void *devm_kcalloc(struct device *dev, - size_t n, size_t size, gfp_t flags) -{ - return devm_kmalloc_array(dev, n, size, flags | __GFP_ZERO); -} -static inline __realloc_size(3, 4) void * __must_check -devm_krealloc_array(struct device *dev, void *p, size_t new_n, size_t new_size, gfp_t flags) -{ - size_t bytes; - - if (unlikely(check_mul_overflow(new_n, new_size, &bytes))) - return NULL; - - return devm_krealloc(dev, p, bytes, flags); -} - -void devm_kfree(struct device *dev, const void *p); -char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc; -const char *devm_kstrdup_const(struct device *dev, const char *s, gfp_t gfp); -void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp) - __realloc_size(3); - -unsigned long devm_get_free_pages(struct device *dev, - gfp_t gfp_mask, unsigned int order); -void devm_free_pages(struct device *dev, unsigned long addr); - -#ifdef CONFIG_HAS_IOMEM -void __iomem *devm_ioremap_resource(struct device *dev, - const struct resource *res); -void __iomem *devm_ioremap_resource_wc(struct device *dev, - const struct resource *res); - -void __iomem *devm_of_iomap(struct device *dev, - struct device_node *node, int index, - resource_size_t *size); -#else - -static inline -void __iomem *devm_ioremap_resource(struct device *dev, - const struct resource *res) -{ - return ERR_PTR(-EINVAL); -} - -static inline -void __iomem *devm_ioremap_resource_wc(struct device *dev, - const struct resource *res) -{ - return ERR_PTR(-EINVAL); -} - -static inline -void __iomem *devm_of_iomap(struct device *dev, - struct device_node *node, int index, - resource_size_t *size) -{ - return ERR_PTR(-EINVAL); -} - -#endif - /* allows to add/remove a custom action to devres stack */ int devm_remove_action_nowarn(struct device *dev, void (*action)(void *), void *data); diff --git a/include/linux/device/devres.h b/include/linux/device/devres.h new file mode 100644 index 000000000000..6b0b265058bc --- /dev/null +++ b/include/linux/device/devres.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _DEVICE_DEVRES_H_ +#define _DEVICE_DEVRES_H_ + +#include +#include +#include +#include +#include +#include + +struct device; +struct device_node; +struct resource; + +/* device resource management */ +typedef void (*dr_release_t)(struct device *dev, void *res); +typedef int (*dr_match_t)(struct device *dev, void *res, void *match_data); + +void * __malloc +__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, int nid, const char *name); +#define devres_alloc(release, size, gfp) \ + __devres_alloc_node(release, size, gfp, NUMA_NO_NODE, #release) +#define devres_alloc_node(release, size, gfp, nid) \ + 
__devres_alloc_node(release, size, gfp, nid, #release) + +void devres_for_each_res(struct device *dev, dr_release_t release, + dr_match_t match, void *match_data, + void (*fn)(struct device *, void *, void *), + void *data); +void devres_free(void *res); +void devres_add(struct device *dev, void *res); +void *devres_find(struct device *dev, dr_release_t release, dr_match_t match, void *match_data); +void *devres_get(struct device *dev, void *new_res, dr_match_t match, void *match_data); +void *devres_remove(struct device *dev, dr_release_t release, dr_match_t match, void *match_data); +int devres_destroy(struct device *dev, dr_release_t release, dr_match_t match, void *match_data); +int devres_release(struct device *dev, dr_release_t release, dr_match_t match, void *match_data); + +/* devres group */ +void * __must_check devres_open_group(struct device *dev, void *id, gfp_t gfp); +void devres_close_group(struct device *dev, void *id); +void devres_remove_group(struct device *dev, void *id); +int devres_release_group(struct device *dev, void *id); + +/* managed devm_k.alloc/kfree for device drivers */ +void * __alloc_size(2) +devm_kmalloc(struct device *dev, size_t size, gfp_t gfp); +void * __must_check __realloc_size(3) +devm_krealloc(struct device *dev, void *ptr, size_t size, gfp_t gfp); +static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp) +{ + return devm_kmalloc(dev, size, gfp | __GFP_ZERO); +} +static inline void *devm_kmalloc_array(struct device *dev, size_t n, size_t size, gfp_t flags) +{ + size_t bytes; + + if (unlikely(check_mul_overflow(n, size, &bytes))) + return NULL; + + return devm_kmalloc(dev, bytes, flags); +} +static inline void *devm_kcalloc(struct device *dev, size_t n, size_t size, gfp_t flags) +{ + return devm_kmalloc_array(dev, n, size, flags | __GFP_ZERO); +} +static inline __realloc_size(3, 4) void * __must_check +devm_krealloc_array(struct device *dev, void *p, size_t new_n, size_t new_size, gfp_t flags) +{ + size_t bytes; + + if (unlikely(check_mul_overflow(new_n, new_size, &bytes))) + return NULL; + + return devm_krealloc(dev, p, bytes, flags); +} + +void devm_kfree(struct device *dev, const void *p); + +void * __realloc_size(3) +devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp); + +char * __malloc +devm_kstrdup(struct device *dev, const char *s, gfp_t gfp); +const char *devm_kstrdup_const(struct device *dev, const char *s, gfp_t gfp); +char * __printf(3, 0) __malloc +devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap); +char * __printf(3, 4) __malloc +devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...); + +unsigned long devm_get_free_pages(struct device *dev, gfp_t gfp_mask, unsigned int order); +void devm_free_pages(struct device *dev, unsigned long addr); + +#ifdef CONFIG_HAS_IOMEM + +void __iomem *devm_ioremap_resource(struct device *dev, const struct resource *res); +void __iomem *devm_ioremap_resource_wc(struct device *dev, const struct resource *res); + +void __iomem *devm_of_iomap(struct device *dev, struct device_node *node, int index, + resource_size_t *size); +#else + +static inline +void __iomem *devm_ioremap_resource(struct device *dev, const struct resource *res) +{ + return IOMEM_ERR_PTR(-EINVAL); +} + +static inline +void __iomem *devm_ioremap_resource_wc(struct device *dev, const struct resource *res) +{ + return IOMEM_ERR_PTR(-EINVAL); +} + +static inline +void __iomem *devm_of_iomap(struct device *dev, struct device_node *node, int index, + resource_size_t 
*size) +{ + return IOMEM_ERR_PTR(-EINVAL); +} + +#endif + +#endif /* _DEVICE_DEVRES_H_ */ -- cgit v1.2.3 From a103b833ac3806b816bc993cba77d0b17cf801f1 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Wed, 12 Feb 2025 11:55:05 +0530 Subject: devres: Introduce devm_kmemdup_array() Introduce '_array' variant of devm_kmemdup() which is more robust and consistent with alloc family of helpers. Suggested-by: Andy Shevchenko Signed-off-by: Raag Jadav Reviewed-by: Dmitry Torokhov Reviewed-by: Linus Walleij Signed-off-by: Andy Shevchenko --- include/linux/device/devres.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device/devres.h b/include/linux/device/devres.h index 6b0b265058bc..9b49f9915850 100644 --- a/include/linux/device/devres.h +++ b/include/linux/device/devres.h @@ -79,6 +79,11 @@ void devm_kfree(struct device *dev, const void *p); void * __realloc_size(3) devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp); +static inline void *devm_kmemdup_array(struct device *dev, const void *src, + size_t n, size_t size, gfp_t flags) +{ + return devm_kmemdup(dev, src, size_mul(size, n), flags); +} char * __malloc devm_kstrdup(struct device *dev, const char *s, gfp_t gfp); -- cgit v1.2.3 From 12851bd921d429c60578b90916fc220b60757c34 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 21 Feb 2025 20:39:29 +0000 Subject: fs: Turn page_offset() into a wrapper around folio_pos() This is far less efficient for the lagging filesystems which still use page_offset(), but it removes an access to page->index. It also fixes a bug -- if any filesystem passed a tail page to page_offset(), it would return garbage which might result in the filesystem choosing to not writeback a dirty page. There probably aren't any examples of this, but I can't be certain. Signed-off-by: "Matthew Wilcox (Oracle)" Link: https://lore.kernel.org/r/20250221203932.3588740-1-willy@infradead.org Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 47bfc6b1b632..f348e7005306 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1044,21 +1044,23 @@ static inline pgoff_t page_pgoff(const struct folio *folio, return folio->index + folio_page_idx(folio, page); } -/* - * Return byte-offset into filesystem object for page. +/** + * folio_pos - Returns the byte position of this folio in its file. + * @folio: The folio. */ -static inline loff_t page_offset(struct page *page) +static inline loff_t folio_pos(const struct folio *folio) { - return ((loff_t)page->index) << PAGE_SHIFT; + return ((loff_t)folio->index) * PAGE_SIZE; } -/** - * folio_pos - Returns the byte position of this folio in its file. - * @folio: The folio. +/* + * Return byte-offset into filesystem object for page. 
*/ -static inline loff_t folio_pos(struct folio *folio) +static inline loff_t page_offset(struct page *page) { - return page_offset(&folio->page); + struct folio *folio = page_folio(page); + + return folio_pos(folio) + folio_page_idx(folio, page) * PAGE_SIZE; } /* -- cgit v1.2.3 From 47dd67532303803a87f43195e088b3b4bcf0454d Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Feb 2025 14:38:22 -0800 Subject: block/bdev: lift block size restrictions to 64k We now can support blocksizes larger than PAGE_SIZE, so in theory we should be able to lift the restriction up to the max supported page cache order. However bound ourselves to what we can currently validate and test. Through blktests and fstest we can validate up to 64k today. Reviewed-by: Hannes Reinecke Reviewed-by: "Matthew Wilcox (Oracle)" Signed-off-by: Luis Chamberlain Link: https://lore.kernel.org/r/20250221223823.1680616-8-mcgrof@kernel.org Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 248416ecd01c..a97428e8bbbe 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -267,10 +267,16 @@ static inline dev_t disk_devt(struct gendisk *disk) return MKDEV(disk->major, disk->first_minor); } +/* + * We should strive for 1 << (PAGE_SHIFT + MAX_PAGECACHE_ORDER) + * however we constrain this to what we can validate and test. + */ +#define BLK_MAX_BLOCK_SIZE SZ_64K + /* blk_validate_limits() validates bsize, so drivers don't usually need to */ static inline int blk_validate_block_size(unsigned long bsize) { - if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) + if (bsize < 512 || bsize > BLK_MAX_BLOCK_SIZE || !is_power_of_2(bsize)) return -EINVAL; return 0; -- cgit v1.2.3 From 0dffacbbf8d044456d50c893adb9499775c489f4 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Thu, 20 Feb 2025 19:58:04 +0100 Subject: regulator: Add (devm_)of_regulator_get() The Rockchip power-domain controller also plans to make use of per-domain regulators similar to the MediaTek power-domain controller. Since existing DTs are missing the regulator information, the kernel should fallback to the automatically created dummy regulator if necessary. Thus the version without the _optional suffix is needed. The Rockchip driver plans to use the managed version, but to be consistent with existing code the unmanaged version is added at the same time. 
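For illustration, the intended per-domain usage looks roughly like this; the "domain" supply name and the surrounding driver context are hypothetical:

	static int example_power_on_domain(struct device *dev,
					   struct device_node *domain_np)
	{
		struct regulator *supply;

		/* falls back to the dummy regulator if the DT omits the supply */
		supply = devm_of_regulator_get(dev, domain_np, "domain");
		if (IS_ERR(supply))
			return PTR_ERR(supply);

		return regulator_enable(supply);
	}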
Tested-by: Heiko Stuebner Signed-off-by: Sebastian Reichel Link: https://patch.msgid.link/20250220-rk3588-gpu-pwr-domain-regulator-v6-1-a4f9c24e5b81@kernel.org Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index ffe912f345ae..56fe2693d9b2 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -677,6 +677,12 @@ regulator_is_equal(struct regulator *reg1, struct regulator *reg2) #endif #if IS_ENABLED(CONFIG_OF) && IS_ENABLED(CONFIG_REGULATOR) +struct regulator *__must_check of_regulator_get(struct device *dev, + struct device_node *node, + const char *id); +struct regulator *__must_check devm_of_regulator_get(struct device *dev, + struct device_node *node, + const char *id); struct regulator *__must_check of_regulator_get_optional(struct device *dev, struct device_node *node, const char *id); -- cgit v1.2.3 From cc9554e662a3b5a13f514cfcced54ac263c07094 Mon Sep 17 00:00:00 2001 From: Yonatan Goldschmidt Date: Mon, 24 Feb 2025 00:32:34 +0200 Subject: binfmt: Remove loader from linux_binprm struct Commit 987f20a9dcce ("a.out: Remove the a.out implementation") removed the last in-tree user of the loader field, and as far as I can tell, it was the only one historically. Signed-off-by: Yonatan Goldschmidt Link: https://lore.kernel.org/r/20250223223234.13764-1-yon.goldschmidt@gmail.com Signed-off-by: Kees Cook --- include/linux/binfmts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 60d674af3080..1625c8529e70 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -64,7 +64,7 @@ struct linux_binprm { const char *fdpath; /* generated filename for execveat */ unsigned interp_flags; int execfd; /* File descriptor of the executable */ - unsigned long loader, exec; + unsigned long exec; struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */ -- cgit v1.2.3 From 2145ba374069ee8edc9d29c2a6b56fe4a28a6e2d Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 19 Feb 2025 22:04:33 +0100 Subject: gpio: mmio: Add flag for calling pinctrl back-end It turns out that with this flag we can switch over an entire driver to use gpio-mmio instead of a bunch of custom code, also providing get/set_multiple() to it in the process, so it seems like a reasonable feature to add. The generic pin control backend requires us to call the gpiochip_generic_request(), gpiochip_generic_free(), pinctrl_gpio_direction_output() and pinctrl_gpio_direction_input() callbacks, so if the new flag for a pin control back-end is set, we make sure these functions get called as expected. Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20250219-vf610-mmio-v3-1-588b64f0b689@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index f2145e938b29..ae96a2f260fb 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -397,6 +397,7 @@ struct gpio_irq_chip { * @reg_dir_in: direction in setting register for generic GPIO * @bgpio_dir_unreadable: indicates that the direction register(s) cannot * be read and we need to rely on out internal state tracking. + * @bgpio_pinctrl: the generic GPIO uses a pin control backend. 
* @bgpio_bits: number of register bits used for a generic GPIO i.e. * * 8 * @bgpio_lock: used to lock chip->bgpio_data. Also, this is needed to keep @@ -481,6 +482,7 @@ struct gpio_chip { void __iomem *reg_dir_out; void __iomem *reg_dir_in; bool bgpio_dir_unreadable; + bool bgpio_pinctrl; int bgpio_bits; raw_spinlock_t bgpio_lock; unsigned long bgpio_data; @@ -721,6 +723,7 @@ int bgpio_init(struct gpio_chip *gc, struct device *dev, #define BGPIOF_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ #define BGPIOF_NO_OUTPUT BIT(5) /* only input */ #define BGPIOF_NO_SET_ON_INPUT BIT(6) +#define BGPIOF_PINCTRL_BACKEND BIT(7) /* Call pinctrl direction setters */ #ifdef CONFIG_GPIOLIB_IRQCHIP int gpiochip_irqchip_add_domain(struct gpio_chip *gc, -- cgit v1.2.3 From dc5bb9b769c9c3e471609a4e7444ab539c5f3f1f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 28 Jan 2025 11:46:34 -0500 Subject: cpumask: deprecate cpumask_next_wrap() The next patch aligns the implementation of cpumask_next_wrap() with find_next_bit_wrap() and changes the function signature. To make the transition smooth, this patch deprecates the current implementation by adding an _old suffix. The following patches switch current users to the new implementation one by one. No functional changes were intended. Signed-off-by: Yury Norov --- include/linux/cpumask.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index e57fd41ca38e..7f7f17cfd21d 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -296,7 +296,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, #if NR_CPUS == 1 static __always_inline -unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) +unsigned int cpumask_next_wrap_old(int n, const struct cpumask *mask, int start, bool wrap) { cpumask_check(start); if (n != -1) @@ -312,7 +312,7 @@ unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, boo return cpumask_first(mask); } #else -unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap); +unsigned int __pure cpumask_next_wrap_old(int n, const struct cpumask *mask, int start, bool wrap); #endif /** -- cgit v1.2.3 From 3268cb2e49cc2a2d142f210efdc82602639d2047 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 28 Jan 2025 11:46:35 -0500 Subject: cpumask: re-introduce cpumask_next{,_and}_wrap() cpumask_next_wrap_old() has two additional parameters compared to its generic counterpart find_next_bit_wrap(). The reason for that is historical. Before 4fe49b3b97c262 ("lib/bitmap: introduce for_each_set_bit_wrap() macro"), cpumask_next_wrap() was used to implement the for_each_cpu_wrap() iterator. Now that the iterator is an alias to the generic for_each_set_bit_wrap(), the additional parameters aren't used and may confuse readers. All existing users call cpumask_next_wrap() in a way that makes it possible to turn it into a straight and simple alias to find_next_bit_wrap(). In a couple of places kernel users open-code the missing cpumask_next_and_wrap(). Add it as well.
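As a usage sketch of the simplified signature added below (pick_next_cpu() is an illustrative helper, not existing kernel code):

	static int pick_next_cpu(int prev_cpu, const struct cpumask *allowed)
	{
		/* The search starts at prev_cpu + 1 and wraps to the start of the mask. */
		unsigned int cpu = cpumask_next_wrap(prev_cpu, allowed);

		return cpu < nr_cpu_ids ? cpu : -ENODEV;
	}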
CC: Alexander Gordeev CC: Bjorn Helgaas Signed-off-by: Yury Norov --- include/linux/cpumask.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 7f7f17cfd21d..87d9784e009d 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -284,6 +284,44 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, small_cpumask_bits, n + 1); } +/** + * cpumask_next_and_wrap - get the next cpu in *src1p & *src2p, starting from + * @n+1. If nothing found, wrap around and start from + * the beginning + * @n: the cpu prior to the place to search (i.e. search starts from @n+1) + * @src1p: the first cpumask pointer + * @src2p: the second cpumask pointer + * + * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src1p & @src2p is empty. + */ +static __always_inline +unsigned int cpumask_next_and_wrap(int n, const struct cpumask *src1p, + const struct cpumask *src2p) +{ + /* -1 is a legal arg here. */ + if (n != -1) + cpumask_check(n); + return find_next_and_bit_wrap(cpumask_bits(src1p), cpumask_bits(src2p), + small_cpumask_bits, n + 1); +} + +/** + * cpumask_next_wrap - get the next cpu in *src, starting from @n+1. If nothing + * found, wrap around and start from the beginning + * @n: the cpu prior to the place to search (i.e. search starts from @n+1) + * @src: cpumask pointer + * + * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src is empty. + */ +static __always_inline +unsigned int cpumask_next_wrap(int n, const struct cpumask *src) +{ + /* -1 is a legal arg here. */ + if (n != -1) + cpumask_check(n); + return find_next_bit_wrap(cpumask_bits(src), small_cpumask_bits, n + 1); +} + /** * for_each_cpu - iterate over every cpu in a mask * @cpu: the (optionally unsigned) integer iterator -- cgit v1.2.3 From 14c384131ea09fb70e9e01b0a3f2c3d3cd56d832 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Tue, 28 Jan 2025 11:46:42 -0500 Subject: cpumask: drop cpumask_next_wrap_old() Now that we have cpumask_next_wrap() wired to generic find_next_bit_wrap(), the old implementation is not needed. Signed-off-by: Yury Norov --- include/linux/cpumask.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 87d9784e009d..9289d4612170 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -332,27 +332,6 @@ unsigned int cpumask_next_wrap(int n, const struct cpumask *src) #define for_each_cpu(cpu, mask) \ for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits) -#if NR_CPUS == 1 -static __always_inline -unsigned int cpumask_next_wrap_old(int n, const struct cpumask *mask, int start, bool wrap) -{ - cpumask_check(start); - if (n != -1) - cpumask_check(n); - - /* - * Return the first available CPU when wrapping, or when starting before cpu0, - * since there is only one valid option. 
- */ - if (wrap && n >= 0) - return nr_cpumask_bits; - return cpumask_first(mask); -} -#else -unsigned int __pure cpumask_next_wrap_old(int n, const struct cpumask *mask, int start, bool wrap); -#endif - /** * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location * @cpu: the (optionally unsigned) integer iterator -- cgit v1.2.3 From 89ac4a59ca6d98091e8317bc5a342f38669e19e2 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 21 Feb 2025 12:07:28 +0100 Subject: skbuff: kill skb_flow_get_ports() Since commit a815bde56b15 ("net, bonding: Refactor bond_xmit_hash for use with xdp_buff"), this function is not used anymore. Signed-off-by: Nicolas Dichtel Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250221110941.2041629-2-nicolas.dichtel@6wind.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0b4f1889500d..21644e9215e4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1536,12 +1536,6 @@ u32 __skb_get_poff(const struct sk_buff *skb, const void *data, __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, const void *data, int hlen_proto); -static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, - int thoff, u8 ip_proto) -{ - return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); -} - void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); -- cgit v1.2.3 From c52fd4f083cc634c57fc98fce36860e63f6bce2b Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 21 Feb 2025 12:07:29 +0100 Subject: net: remove '__' from __skb_flow_get_ports() Only one version of skb_flow_get_ports() exists after the previous commit, so let's remove the useless '__'. Suggested-by: Simon Horman Signed-off-by: Nicolas Dichtel Link: https://patch.msgid.link/20250221110941.2041629-3-nicolas.dichtel@6wind.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 21644e9215e4..f2bb8473d99a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1533,8 +1533,8 @@ void __skb_get_hash_net(const struct net *net, struct sk_buff *skb); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen); -__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, - const void *data, int hlen_proto); +__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + const void *data, int hlen_proto); void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, -- cgit v1.2.3 From 85e4a808af2545fefaf18c8fe50071b06fcbdabc Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Thu, 20 Feb 2025 23:39:53 +0200 Subject: net/mlx5e: Add correct match to check IPSec syndromes for switchdev mode In commit dddb49b63d86 ("net/mlx5e: Add IPsec and ASO syndromes check in HW"), IPSec and ASO syndromes checks after decryption for the specified ASO object were added. But they are correct only for eswitch in legacy mode. For switchdev mode, metadata register c1 is used to save the mapped id (not the ASO object id). So, we need to change the match accordingly for the check rules in the status table.
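A hedged sketch of the corrected status-table match, assuming the usual mlx5 flow-spec helpers; the function name is illustrative and the mapped id is assumed to be already shifted into its register C1 field:

	static void ipsec_rx_status_match(struct mlx5_flow_spec *spec, u32 mapped_id)
	{
		/* Match only the mapped-id bits of metadata register C1. */
		MLX5_SET(fte_match_param, spec->match_criteria,
			 misc_parameters_2.metadata_reg_c_1,
			 ESW_IPSEC_RX_MAPPED_ID_MATCH_MASK);
		MLX5_SET(fte_match_param, spec->match_value,
			 misc_parameters_2.metadata_reg_c_1,
			 mapped_id & ESW_IPSEC_RX_MAPPED_ID_MATCH_MASK);
		spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2;
	}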
Signed-off-by: Jianbo Liu Reviewed-by: Leon Romanovsky Reviewed-by: Patrisious Haddad Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250220213959.504304-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/eswitch.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index df73a2ccc9af..67256e776566 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -147,6 +147,8 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw, /* reuse tun_opts for the mapped ipsec obj id when tun_id is 0 (invalid) */ #define ESW_IPSEC_RX_MAPPED_ID_MASK GENMASK(ESW_TUN_OPTS_BITS - 1, 0) +#define ESW_IPSEC_RX_MAPPED_ID_MATCH_MASK \ + GENMASK(31 - ESW_RESERVED_BITS, ESW_ZONE_ID_BITS) u8 mlx5_eswitch_mode(const struct mlx5_core_dev *dev); u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev); -- cgit v1.2.3 From a3e51d4711793be001220784bd7d8ce81517003e Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 22 Feb 2025 09:36:10 +0100 Subject: net: phy: add phylib-internal.h This patch is a starting point for moving phylib-internal declarations to a private header file. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/082eacd2-a888-4716-8797-b3491ce02820@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 13be48d3b8b3..7bfbae51070a 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -181,13 +181,6 @@ static inline void phy_interface_set_rgmii(unsigned long *intf) __set_bit(PHY_INTERFACE_MODE_RGMII_TXID, intf); } -/* - * phy_supported_speeds - return all speeds currently supported by a PHY device - */ -unsigned int phy_supported_speeds(struct phy_device *phy, - unsigned int *speeds, - unsigned int size); - /** * phy_modes - map phy_interface_t enum to device tree binding of phy-mode * @interface: enum phy_interface_t value @@ -1331,10 +1324,6 @@ phy_lookup_setting(int speed, int duplex, const unsigned long *mask, bool exact); size_t phy_speeds(unsigned int *speeds, size_t size, unsigned long *mask); -void of_set_phy_supported(struct phy_device *phydev); -void of_set_phy_eee_broken(struct phy_device *phydev); -void of_set_phy_timing_role(struct phy_device *phydev); -int phy_speed_down_core(struct phy_device *phydev); /** * phy_is_started - Convenience function to check whether PHY is started @@ -1360,7 +1349,6 @@ static inline void phy_disable_eee_mode(struct phy_device *phydev, u32 link_mode void phy_resolve_aneg_pause(struct phy_device *phydev); void phy_resolve_aneg_linkmode(struct phy_device *phydev); -void phy_check_downshift(struct phy_device *phydev); /** * phy_read - Convenience function for reading a given PHY register @@ -2035,7 +2023,6 @@ int genphy_c45_ethtool_get_eee(struct phy_device *phydev, int genphy_c45_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data); int genphy_c45_an_config_eee_aneg(struct phy_device *phydev); -int genphy_c45_read_eee_adv(struct phy_device *phydev, unsigned long *adv); /* Generic C45 PHY driver */ extern struct phy_driver genphy_c45_driver; -- cgit v1.2.3 From 287044abff8291993ce9565ac6e6a72b85e33b85 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 23 Feb 2025 21:45:07 +0100 Subject: sctp: Remove unused payload from sctp_idatahdr Remove the unused payload array from the struct sctp_idatahdr. 
Cc: Kees Cook Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20250223204505.2499-3-thorsten.blum@linux.dev Signed-off-by: Paolo Abeni --- include/linux/sctp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 812011d8b67e..6719949135c9 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -238,7 +238,6 @@ struct sctp_idatahdr { __u32 ppid; __be32 fsn; }; - __u8 payload[0]; }; struct sctp_idata_chunk { -- cgit v1.2.3 From db99ea5f2c0361c8fc2878792e97c7b67c811bd0 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Wed, 12 Feb 2025 14:36:39 +0000 Subject: EDAC: Add support for EDAC device features control Add generic EDAC device feature controls supporting the registration of RAS features available in the system. The driver exposes control attributes for these features to userspace in /sys/bus/edac/devices// [ bp: Touch-up documentation, simplify, make edac_dev_type static, fixup edac_dev_register() retvals. ] Co-developed-by: Jonathan Cameron Signed-off-by: Jonathan Cameron Signed-off-by: Shiju Jose Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Fan Ni Tested-by: Daniel Ferguson Tested-by: Fan Ni Link: https://lore.kernel.org/r/20250212143654.1893-2-shiju.jose@huawei.com --- include/linux/edac.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/edac.h b/include/linux/edac.h index b4ee8961e623..8c4b6ca2a994 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -661,4 +661,30 @@ static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci, return mci->dimms[index]; } + +/* RAS feature type */ +enum edac_dev_feat { + RAS_FEAT_MAX +}; + +/* EDAC device feature information structure */ +struct edac_dev_data { + u8 instance; + void *private; +}; + +struct edac_dev_feat_ctx { + struct device dev; + void *private; +}; + +struct edac_dev_feature { + enum edac_dev_feat ft_type; + u8 instance; + void *ctx; +}; + +int edac_dev_register(struct device *parent, char *dev_name, + void *parent_pvt_data, int num_features, + const struct edac_dev_feature *ras_features); #endif /* _LINUX_EDAC_H_ */ -- cgit v1.2.3 From f90b738166fe909df48de6a03744ddfbad5002f8 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Wed, 12 Feb 2025 14:36:40 +0000 Subject: EDAC: Add scrub control feature Add a scrub control to manage memory scrubbers in the system. Devices with a scrub feature register with the EDAC device driver which retrieves the scrub descriptor from the scrub driver and exposes the control attributes for an instance to userspace at /sys/bus/edac/devices//scrubX/. The common sysfs scrub control interface abstracts the control of arbitrary scrubbing functionality into a common set of functions. The attribute nodes are only present if the client driver has implemented the corresponding attribute callback function and passed the operations to the device driver during registration. [ bp: Massage commit message, docs and code, simplify text a bit.
Integrate fixup for: https://lore.kernel.org/r/202502251009.0sGkolEJ-lkp@intel.com Reported-by: kernel test robot Reported-by: Dan Carpenter ] Co-developed-by: Jonathan Cameron Signed-off-by: Jonathan Cameron Signed-off-by: Shiju Jose Signed-off-by: Borislav Petkov (AMD) Tested-by: Daniel Ferguson Tested-by: Fan Ni Link: https://lore.kernel.org/r/20250212143654.1893-3-shiju.jose@huawei.com --- include/linux/edac.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'include/linux') diff --git a/include/linux/edac.h b/include/linux/edac.h index 8c4b6ca2a994..1cbab08720df 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -662,13 +662,54 @@ static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci, return mci->dimms[index]; } +#define EDAC_FEAT_NAME_LEN 128 + /* RAS feature type */ enum edac_dev_feat { + RAS_FEAT_SCRUB, RAS_FEAT_MAX }; +/** + * struct edac_scrub_ops - scrub device operations (all elements optional) + * @read_addr: read base address of scrubbing range. + * @read_size: read offset of scrubbing range. + * @write_addr: set base address of the scrubbing range. + * @write_size: set offset of the scrubbing range. + * @get_enabled_bg: check if currently performing background scrub. + * @set_enabled_bg: start or stop a bg-scrub. + * @get_min_cycle: get minimum supported scrub cycle duration in seconds. + * @get_max_cycle: get maximum supported scrub cycle duration in seconds. + * @get_cycle_duration: get current scrub cycle duration in seconds. + * @set_cycle_duration: set current scrub cycle duration in seconds. + */ +struct edac_scrub_ops { + int (*read_addr)(struct device *dev, void *drv_data, u64 *base); + int (*read_size)(struct device *dev, void *drv_data, u64 *size); + int (*write_addr)(struct device *dev, void *drv_data, u64 base); + int (*write_size)(struct device *dev, void *drv_data, u64 size); + int (*get_enabled_bg)(struct device *dev, void *drv_data, bool *enable); + int (*set_enabled_bg)(struct device *dev, void *drv_data, bool enable); + int (*get_min_cycle)(struct device *dev, void *drv_data, u32 *min); + int (*get_max_cycle)(struct device *dev, void *drv_data, u32 *max); + int (*get_cycle_duration)(struct device *dev, void *drv_data, u32 *cycle); + int (*set_cycle_duration)(struct device *dev, void *drv_data, u32 cycle); +}; + +#if IS_ENABLED(CONFIG_EDAC_SCRUB) +int edac_scrub_get_desc(struct device *scrub_dev, + const struct attribute_group **attr_groups, + u8 instance); +#else +static inline int edac_scrub_get_desc(struct device *scrub_dev, + const struct attribute_group **attr_groups, + u8 instance) +{ return -EOPNOTSUPP; } +#endif /* CONFIG_EDAC_SCRUB */ + /* EDAC device feature information structure */ struct edac_dev_data { + const struct edac_scrub_ops *scrub_ops; u8 instance; void *private; }; @@ -676,11 +717,13 @@ struct edac_dev_data { struct edac_dev_feat_ctx { struct device dev; void *private; + struct edac_dev_data *scrub; }; struct edac_dev_feature { enum edac_dev_feat ft_type; u8 instance; + const struct edac_scrub_ops *scrub_ops; void *ctx; }; -- cgit v1.2.3 From bcbd069b11b024994e30c7c2f3d716a4141fdab1 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Wed, 12 Feb 2025 14:36:41 +0000 Subject: EDAC: Add a Error Check Scrub control feature Add an Error Check Scrub (ECS) control to manage a memory device's ECS feature. 
The ECS is a feature defined in JEDEC DDR5 SDRAM Specification (JESD79-5) and allows the DRAM to internally read, correct single-bit errors, and write back corrected data bits to the DRAM array while providing transparency to error counts. The DDR5 device contains a number of memory media Field Replaceable Units (FRU). The DDR5 ECS feature, and thus the ECS control driver, supports configuring the ECS parameters per FRU. Memory devices that support the ECS feature register with the EDAC device driver, which retrieves the ECS descriptor from the EDAC ECS driver. This driver exposes sysfs ECS control attributes to userspace via /sys/bus/edac/devices//ecs_fruX/. The common sysfs ECS control interface abstracts the control of an arbitrary ECS functionality to a common set of functions. Support for the ECS feature is added separately because the control attributes of the DDR5 ECS feature differ from those of the scrub feature. The sysfs ECS attribute nodes are only present if the client driver has implemented the corresponding attribute callback function and passed the necessary operations to the EDAC RAS feature driver during registration. [ bp: Massage, fixup edac_dev_register() retvals. ] Co-developed-by: Jonathan Cameron Signed-off-by: Jonathan Cameron Signed-off-by: Shiju Jose Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Fan Ni Tested-by: Fan Ni Link: https://lore.kernel.org/r/20250212143654.1893-4-shiju.jose@huawei.com --- include/linux/edac.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/edac.h b/include/linux/edac.h index 1cbab08720df..f8346014c14e 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -667,6 +667,7 @@ static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci, /* RAS feature type */ enum edac_dev_feat { RAS_FEAT_SCRUB, + RAS_FEAT_ECS, RAS_FEAT_MAX }; @@ -707,9 +708,47 @@ static inline int edac_scrub_get_desc(struct device *scrub_dev, { return -EOPNOTSUPP; } #endif /* CONFIG_EDAC_SCRUB */ +/** + * struct edac_ecs_ops - ECS device operations (all elements optional) + * @get_log_entry_type: read the log entry type value. + * @set_log_entry_type: set the log entry type value. + * @get_mode: read the mode value. + * @set_mode: set the mode value. + * @reset: reset the ECS counter. + * @get_threshold: read the threshold count per gigabits of memory cells. + * @set_threshold: set the threshold count per gigabits of memory cells.
+ */ +struct edac_ecs_ops { + int (*get_log_entry_type)(struct device *dev, void *drv_data, int fru_id, u32 *val); + int (*set_log_entry_type)(struct device *dev, void *drv_data, int fru_id, u32 val); + int (*get_mode)(struct device *dev, void *drv_data, int fru_id, u32 *val); + int (*set_mode)(struct device *dev, void *drv_data, int fru_id, u32 val); + int (*reset)(struct device *dev, void *drv_data, int fru_id, u32 val); + int (*get_threshold)(struct device *dev, void *drv_data, int fru_id, u32 *threshold); + int (*set_threshold)(struct device *dev, void *drv_data, int fru_id, u32 threshold); +}; + +struct edac_ecs_ex_info { + u16 num_media_frus; +}; + +#if IS_ENABLED(CONFIG_EDAC_ECS) +int edac_ecs_get_desc(struct device *ecs_dev, + const struct attribute_group **attr_groups, + u16 num_media_frus); +#else +static inline int edac_ecs_get_desc(struct device *ecs_dev, + const struct attribute_group **attr_groups, + u16 num_media_frus) +{ return -EOPNOTSUPP; } +#endif /* CONFIG_EDAC_ECS */ + /* EDAC device feature information structure */ struct edac_dev_data { - const struct edac_scrub_ops *scrub_ops; + union { + const struct edac_scrub_ops *scrub_ops; + const struct edac_ecs_ops *ecs_ops; + }; u8 instance; void *private; }; @@ -718,13 +757,18 @@ struct edac_dev_feat_ctx { struct device dev; void *private; struct edac_dev_data *scrub; + struct edac_dev_data ecs; }; struct edac_dev_feature { enum edac_dev_feat ft_type; u8 instance; - const struct edac_scrub_ops *scrub_ops; + union { + const struct edac_scrub_ops *scrub_ops; + const struct edac_ecs_ops *ecs_ops; + }; void *ctx; + struct edac_ecs_ex_info ecs_info; }; int edac_dev_register(struct device *parent, char *dev_name, -- cgit v1.2.3 From 889c57066ceee5e9172232da0608a8ac053bb6e5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 25 Feb 2025 10:21:41 +0800 Subject: block: make segment size limit workable for > 4K PAGE_SIZE Using PAGE_SIZE as a minimum expected DMA segment size leads to probe failures for devices which have a max DMA segment size of < 64k when used on 64k PAGE_SIZE systems, such as eMMC and the Exynos UFS controller [0] [1]. You can end up with a probe failure as follows: WARNING: CPU: 2 PID: 397 at block/blk-settings.c:339 blk_validate_limits+0x364/0x3c0 Ensure we use min(max_seg_size, seg_boundary_mask + 1) as the new min segment size when the max segment size is < PAGE_SIZE for 16k and 64k base page size systems.
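A hedged sketch of the resulting clamp; the helper name is illustrative, as the real change lives inside blk_validate_limits():

	static unsigned int blk_min_seg_size(const struct queue_limits *lim)
	{
		/*
		 * Devices may cap DMA segments below PAGE_SIZE on 16k/64k
		 * page kernels, so don't demand PAGE_SIZE as the floor there.
		 */
		if (lim->max_segment_size < PAGE_SIZE)
			return min_t(unsigned int, lim->max_segment_size,
				     lim->seg_boundary_mask + 1);

		return PAGE_SIZE;
	}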
If anyone needs to backport this patch, the following commits are prerequisites: commit 6aeb4f836480 ("block: remove bio_add_pc_page") commit 02ee5d69e3ba ("block: remove blk_rq_bio_prep") commit b7175e24d6ac ("block: add a dma mapping iterator") Link: https://lore.kernel.org/linux-block/20230612203314.17820-1-bvanassche@acm.org/ # [0] Link: https://lore.kernel.org/linux-block/1d55e942-5150-de4c-3a02-c3d066f87028@acm.org/ # [1] Cc: Yi Zhang Cc: John Garry Cc: Keith Busch Tested-by: Paul Bunyan Reviewed-by: Daniel Gomez Reviewed-by: Luis Chamberlain Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250225022141.2154581-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 248416ecd01c..58ff5aca83b6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -367,6 +367,7 @@ struct queue_limits { unsigned int max_sectors; unsigned int max_user_sectors; unsigned int max_segment_size; + unsigned int min_segment_size; unsigned int physical_block_size; unsigned int logical_block_size; unsigned int alignment_offset; -- cgit v1.2.3 From 73cfc53cc3b6380eccf013049574485f64cb83ca Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 21 Feb 2025 14:57:07 +0100 Subject: objtool: Fix C jump table annotations for Clang A C jump table (such as the one used by the BPF interpreter) is a const global array of absolute code addresses, and this means that the actual values in the table may not be known until the kernel is booted (e.g., when using KASLR or when the kernel VA space is sized dynamically). When using PIE codegen, the compiler will default to placing such const global objects in .data.rel.ro (which is annotated as writable), rather than .rodata (which is annotated as read-only). As C jump tables are explicitly emitted into .rodata, this used to result in warnings for LoongArch builds (which use PIE codegen for the entire kernel) like Warning: setting incorrect section attributes for .rodata..c_jump_table due to the fact that the explicitly specified .rodata section inherited the read-write annotation that the compiler uses for such objects when using PIE codegen. This warning was suppressed by explicitly adding the read-only annotation to the __attribute__((section(""))) string, by commit c5b1184decc8 ("compiler.h: specify correct attribute for .rodata..c_jump_table") Unfortunately, this hack does not work on Clang's integrated assembler, which happily interprets the appended section type and permission specifiers as part of the section name, which therefore no longer matches the hard-coded pattern '.rodata..c_jump_table' that objtool expects, causing it to emit a warning kernel/bpf/core.o: warning: objtool: ___bpf_prog_run+0x20: sibling call from callable instruction with modified stack frame Work around this by emitting C jump tables into .data.rel.ro instead, which is treated as .rodata by the linker script for all builds, not just PIE based ones.
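For reference, a minimal example of the construct being annotated, in the style of the BPF interpreter's computed-goto dispatch (the dispatch() function itself is illustrative):

	static int dispatch(const u8 *insns, int cnt)
	{
		/* A C jump table: a const array of absolute label addresses. */
		static const void * const jump_table[2] __annotate_jump_table = {
			[0] = &&op_nop,
			[1] = &&op_halt,
		};
		int i = -1;

	op_nop:
		if (++i < cnt)
			goto *jump_table[insns[i]];
	op_halt:
		return i;
	}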
Fixes: c5b1184decc8 ("compiler.h: specify correct attribute for .rodata..c_jump_table") Tested-by: Tiezhu Yang # on LoongArch Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250221135704.431269-6-ardb+git@google.com Signed-off-by: Josh Poimboeuf --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index b087de2f3e94..0c25f3e429bb 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -110,7 +110,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Unreachable code */ #ifdef CONFIG_OBJTOOL /* Annotate a C jump table to allow objtool to follow the code flow */ -#define __annotate_jump_table __section(".rodata..c_jump_table,\"a\",@progbits #") +#define __annotate_jump_table __section(".data.rel.ro.c_jump_table") #else /* !CONFIG_OBJTOOL */ #define __annotate_jump_table #endif /* CONFIG_OBJTOOL */ -- cgit v1.2.3 From 9084ed79ddaaaa1ec01cd304af9fb532c26252db Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Thu, 20 Feb 2025 14:29:36 -0500 Subject: lsm,nfs: fix memory leak of lsm_context commit b530104f50e8 ("lsm: lsm_context in security_dentry_init_security") did not preserve the lsm id for subsequent release calls, which results in a memory leak. Fix it by saving the lsm id in the nfs4_label and providing it on the subsequent release call. Fixes: b530104f50e8 ("lsm: lsm_context in security_dentry_init_security") Signed-off-by: Stephen Smalley Acked-by: Paul Moore Acked-by: Casey Schaufler Signed-off-by: Anna Schumaker --- include/linux/nfs4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 71fbebfa43c7..9ac83ca88326 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -47,6 +47,7 @@ struct nfs4_acl { struct nfs4_label { uint32_t lfs; uint32_t pi; + u32 lsmid; u32 len; char *label; }; -- cgit v1.2.3 From 18912c520674ec4d920fe3826e7e4fefeecdf5ae Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 24 Feb 2025 09:44:01 -0800 Subject: tcp: devmem: don't write truncated dmabuf CMSGs to userspace Currently, we report -ETOOSMALL (err) only on the first iteration (!sent). When we get a put_cmsg error after a bunch of successful put_cmsg calls, we don't signal the error at all. This might be confusing on the userspace side which will see truncated CMSGs but no MSG_CTRUNC signal. Consider the following case: - sizeof(struct cmsghdr) = 16 - sizeof(struct dmabuf_cmsg) = 24 - total cmsg size (CMSG_LEN) = 40 (16+24) When calling recvmsg with msg_controllen=60, the userspace will receive two(!) dmabuf_cmsg(s), the first one will be a valid one and the second one will be silently truncated. There is no easy way to discover the truncation besides doing something like "cm->cmsg_len != CMSG_LEN(sizeof(dmabuf_cmsg))". Introduce a new put_cmsg_notrunc() wrapper that reports an error instead of doing the truncation. Mina suggests that it's the intended way this API should work. Note that we might now report MSG_CTRUNC when the users (incorrectly) call us with msg_control == NULL.
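A hedged userspace sketch of how a receiver can detect the situation described above, assuming only devmem cmsgs are expected (the helper name is illustrative; struct dmabuf_cmsg comes from the UAPI headers):

	static bool devmem_cmsgs_complete(struct msghdr *msg)
	{
		struct cmsghdr *cm;

		for (cm = CMSG_FIRSTHDR(msg); cm; cm = CMSG_NXTHDR(msg, cm)) {
			/* A truncated entry is shorter than a full dmabuf_cmsg. */
			if (cm->cmsg_len != CMSG_LEN(sizeof(struct dmabuf_cmsg)))
				return false;
		}

		/* With this fix the kernel also raises MSG_CTRUNC. */
		return !(msg->msg_flags & MSG_CTRUNC);
	}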
Fixes: 8f0b3cc9a4c1 ("tcp: RX path for devmem TCP") Reviewed-by: Mina Almasry Signed-off-by: Stanislav Fomichev Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250224174401.3582695-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index d18cc47e89bd..c3322eb3d686 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -392,6 +392,8 @@ struct ucred { extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); +extern int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len, + void *data); struct timespec64; struct __kernel_timespec; -- cgit v1.2.3 From ecdff893384cf169a55a66ca68c9d8917f8417a9 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 24 Feb 2025 19:44:13 +0200 Subject: ethtool: Symmetric OR-XOR RSS hash Add an additional type of symmetric RSS hash type: OR-XOR. The "Symmetric-OR-XOR" algorithm transforms the input as follows: (SRC_IP | DST_IP, SRC_IP ^ DST_IP, SRC_PORT | DST_PORT, SRC_PORT ^ DST_PORT) Change 'cap_rss_sym_xor_supported' to 'supported_input_xfrm', a bitmap of supported RXH_XFRM_* types. Reviewed-by: Cosmin Ratiu Reviewed-by: Tariq Toukan Signed-off-by: Gal Pressman Reviewed-by: Edward Cree Link: https://patch.msgid.link/20250224174416.499070-2-gal@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 870994cc3ef7..7f222dccc7d1 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -763,13 +763,12 @@ struct kernel_ethtool_ts_info { /** * struct ethtool_ops - optional netdev operations + * @supported_input_xfrm: supported types of input xfrm from %RXH_XFRM_*. * @cap_link_lanes_supported: indicates if the driver supports lanes * parameter. * @cap_rss_ctx_supported: indicates if the driver supports RSS * contexts via legacy API, drivers implementing @create_rxfh_context * do not have to set this bit. - * @cap_rss_sym_xor_supported: indicates if the driver supports symmetric-xor - * RSS. * @rxfh_per_ctx_key: device supports setting different RSS key for each * additional context. Netlink API should report hfunc, key, and input_xfrm * for every context, not just context 0. @@ -995,9 +994,9 @@ struct kernel_ethtool_ts_info { * of the generic netdev features interface. */ struct ethtool_ops { + u32 supported_input_xfrm:8; u32 cap_link_lanes_supported:1; u32 cap_rss_ctx_supported:1; - u32 cap_rss_sym_xor_supported:1; u32 rxfh_per_ctx_key:1; u32 cap_rss_rxnfc_adds:1; u32 rxfh_indir_space; -- cgit v1.2.3 From a6aa36e957a1bfb5341986dec32d013d23228fe1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 14 Feb 2025 13:14:34 +0900 Subject: block: Remove zone write plugs when handling native zone append writes For devices that natively support zone append operations, REQ_OP_ZONE_APPEND BIOs are not processed through zone write plugging and are immediately issued to the zoned device. This means that there is no write pointer offset tracking done for these operations and that a zone write plug is not necessary. However, when receiving a zone append BIO, we may already have a zone write plug for the target zone if that zone was previously partially written using regular write operations. 
In such case, since the write pointer offset of the zone write plug is not incremented by the amount of sectors appended to the zone, 2 issues arise: 1) we risk leaving the plug in the disk hash table if the zone is fully written using zone append or regular write operations, because the write pointer offset will never reach the "zone full" state. 2) Regular write operations that are issued after zone append operations will always be failed by blk_zone_wplug_prepare_bio() as the write pointer alignment check will fail, even if the user correctly accounted for the zone append operations and issued the regular writes with a correct sector. Avoid these issues by immediately removing the zone write plug of zones that are the target of zone append operations when blk_zone_plug_bio() is called. The new function blk_zone_wplug_handle_native_zone_append() implements this for devices that natively support zone append. The removal of the zone write plug using disk_remove_zone_wplug() requires aborting all plugged regular write using disk_zone_wplug_abort() as otherwise the plugged write BIOs would never be executed (with the plug removed, the completion path will never see again the zone write plug as disk_get_zone_wplug() will return NULL). Rate-limited warnings are added to blk_zone_wplug_handle_native_zone_append() and to disk_zone_wplug_abort() to signal this. Since blk_zone_wplug_handle_native_zone_append() is called in the hot path for operations that will not be plugged, disk_get_zone_wplug() is optimized under the assumption that a user issuing zone append operations is not at the same time issuing regular writes and that there are no hashed zone write plugs. The struct gendisk atomic counter nr_zone_wplugs is added to check this, with this counter incremented in disk_insert_zone_wplug() and decremented in disk_remove_zone_wplug(). To be consistent with this fix, we do not need to fill the zone write plug hash table with zone write plugs for zones that are partially written for a device that supports native zone append operations. So modify blk_revalidate_seq_zone() to return early to avoid allocating and inserting a zone write plug for partially written sequential zones if the device natively supports zone append. 
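A hedged sketch (not the verbatim kernel code; locking and refcounting are elided) of the hot-path logic described above, using the helper names given in this commit:

	static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
	{
		struct gendisk *disk = bio->bi_bdev->bd_disk;
		struct blk_zone_wplug *zwplug;

		/* Common case: no zone was ever partially written, no plug hashed. */
		if (!atomic_read(&disk->nr_zone_wplugs))
			return;

		zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
		if (!zwplug)
			return;

		/*
		 * Zone append makes write-pointer tracking meaningless: abort
		 * plugged regular writes (they would otherwise never be issued
		 * again) and drop the plug from the hash table.
		 */
		disk_zone_wplug_abort(zwplug);
		disk_remove_zone_wplug(disk, zwplug);
	}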
Reported-by: Jorgen Hansen Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Tested-by: Jorgen Hansen Link: https://lore.kernel.org/r/20250214041434.82564-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 58ff5aca83b6..d37751789bf5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -196,10 +196,11 @@ struct gendisk { unsigned int zone_capacity; unsigned int last_zone_capacity; unsigned long __rcu *conv_zones_bitmap; - unsigned int zone_wplugs_hash_bits; - spinlock_t zone_wplugs_lock; + unsigned int zone_wplugs_hash_bits; + atomic_t nr_zone_wplugs; + spinlock_t zone_wplugs_lock; struct mempool_s *zone_wplugs_pool; - struct hlist_head *zone_wplugs_hash; + struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ -- cgit v1.2.3 From 4e4136c6446753e6da4424a734f84de82c70600f Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Tue, 25 Feb 2025 15:35:45 -0800 Subject: selftests/bpf: Test gen_pro/epilogue that generate kfuncs Test gen_prologue and gen_epilogue that generate kfuncs that have not been seen in the main program. The main bpf program and return value checks are identical to pro_epilogue.c introduced in commit 47e69431b57a ("selftests/bpf: Test gen_prologue and gen_epilogue"). However, now when bpf_testmod_st_ops detects a program name with prefix "test_kfunc_", it generates slightly different prologue and epilogue: They still add 1000 to args->a in prologue, add 10000 to args->a and set r0 to 2 * args->a in epilogue, but involve kfuncs. At high level, the alternative version of prologue and epilogue look like this: cgrp = bpf_cgroup_from_id(0); if (cgrp) bpf_cgroup_release(cgrp); else /* Perform what original bpf_testmod_st_ops prologue or * epilogue does */ Since 0 is never a valid cgroup id, the original prologue or epilogue logic will be performed. As a result, the __retval check should expect the exact same return value. Signed-off-by: Amery Hung Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20250225233545.285481-2-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index a3ea46281595..3ed6eb9e7c73 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -469,6 +469,16 @@ static inline bool insn_is_cast_user(const struct bpf_insn *insn) .off = 0, \ .imm = BPF_CALL_IMM(FUNC) }) +/* Kfunc call */ + +#define BPF_CALL_KFUNC(OFF, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_CALL, \ + .dst_reg = 0, \ + .src_reg = BPF_PSEUDO_KFUNC_CALL, \ + .off = OFF, \ + .imm = IMM }) + /* Raw code statement block */ #define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ -- cgit v1.2.3 From edd3e3b7d210747dec723edd2b6cb49d140c1256 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 24 Feb 2025 09:47:56 -0500 Subject: iomap: rename iomap_iter processed field to status The iter.processed field name is no longer appropriate now that iomap operations do not return the number of bytes processed. Rename the field to iter.status to reflect that a success or error code is expected. Also change the type to int as there is no longer a need for an s64. 
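For illustration, a hedged sketch of the canonical iteration pattern under the renamed field (example_iomap_op() and example_handle_mapping() are illustrative names, not kernel functions):

	static int example_iomap_op(struct inode *inode, loff_t pos, u64 len,
				    const struct iomap_ops *ops)
	{
		struct iomap_iter iter = {
			.inode	= inode,
			.pos	= pos,
			.len	= len,
		};
		int ret;

		/* Each pass stores a status code, not a byte count. */
		while ((ret = iomap_iter(&iter, ops)) > 0)
			iter.status = example_handle_mapping(&iter);

		return ret;
	}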
This reduces the size of iomap_iter by 8 bytes due to a combination of smaller type and reduction in structure padding. While here, fix up the return types of various _iter() helpers to reflect the type change. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250224144757.237706-12-bfoster@redhat.com Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index e180dacf434c..af9e51fba5f0 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -220,9 +220,8 @@ struct iomap_ops { * It is updated at the same time as @pos. * @iter_start_pos: The original start pos for the current iomap. Used for * incremental iter advance. - * @processed: The number of bytes the most recent iteration needs iomap_iter() - * to advance the iter, zero if the iter was already advanced, or a - * negative errno for an error during the operation. + * @status: Status of the most recent iteration. Zero on success or a negative + * errno on error. * @flags: Zero or more of the iomap_begin flags above. * @iomap: Map describing the I/O iteration * @srcmap: Source map for COW operations @@ -232,7 +231,7 @@ struct iomap_iter { loff_t pos; u64 len; loff_t iter_start_pos; - s64 processed; + int status; unsigned flags; struct iomap iomap; struct iomap srcmap; -- cgit v1.2.3 From d79c9cc512973ef6583c3bfc0b343f9d312d85b3 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 24 Feb 2025 09:47:57 -0500 Subject: iomap: introduce a full map advance helper Various iomap_iter_advance() calls advance by the full mapping length and thus have no need for the current length input or post-advance remaining length output from the standard advance function. Add an iomap_iter_advance_full() helper to clean up these cases. Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20250224144757.237706-13-bfoster@redhat.com Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- include/linux/iomap.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index af9e51fba5f0..1fd66bc29cc1 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -271,6 +271,16 @@ static inline u64 iomap_length(const struct iomap_iter *iter) return iomap_length_trim(iter, iter->pos, iter->len); } +/** + * iomap_iter_advance_full - advance by the full length of current map + */ +static inline int iomap_iter_advance_full(struct iomap_iter *iter) +{ + u64 length = iomap_length(iter); + + return iomap_iter_advance(iter, &length); +} + /** * iomap_iter_srcmap - return the source map for the current iomap iteration * @i: iteration structure -- cgit v1.2.3 From 3ff6c8707c9a0116d00982851ec1216a42053ace Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 26 Feb 2025 17:18:31 +1100 Subject: nfs/vfs: discard d_exact_alias() d_exact_alias() is a descendant of d_add_unique() which was introduced 20 years ago, most likely to work around problems with NFS servers of the time. It is now not used in several situations where it was originally needed and there have been no reports of problems - presumably the old NFS servers have been improved. The only place it is now used is in NFSv4 code, and the old problematic servers are thought to have been v2/v3 only.
There is no clear benefit in reusing an unhashed dentry which happens to have the same name as the dentry we are adding. So this patch removes d_exact_alias() and the one place that it is used. Cc: Trond Myklebust Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/20250226062135.2043651-2-neilb@suse.de Signed-off-by: Christian Brauner --- include/linux/dcache.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4afb60365675..8a63978187a4 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -253,7 +253,6 @@ extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent, const struct qstr *name); -extern struct dentry * d_exact_alias(struct dentry *, struct inode *); extern struct dentry *d_find_any_alias(struct inode *inode); extern struct dentry * d_obtain_alias(struct inode *); extern struct dentry * d_obtain_root(struct inode *); -- cgit v1.2.3 From 699ea5219c4b1d9d8819eb2d99e51a3fdb7b1d7b Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Wed, 12 Feb 2025 14:36:42 +0000 Subject: EDAC: Add a memory repair control feature Add a generic EDAC memory repair control driver to manage memory repairs in the system, such as CXL Post Package Repair (PPR) and other soft and hard PPR features. For example, a CXL device with DRAM components that support PPR features may implement PPR maintenance operations. DRAM components may support two types of PPR: - hard PPR, for a permanent row repair, and - soft PPR, for a temporary row repair. Soft PPR is much faster than hard PPR, but the repair is lost with a power cycle. When a CXL device detects an error in memory, it may report the need for a repair maintenance operation by using an event record where the "maintenance needed" flag is set. The event records contain the device physical address (DPA) and other optional attributes of the memory to repair. The kernel will report the corresponding CXL general media or DRAM trace event to userspace, and userspace tools (e.g. rasdaemon) will initiate a repair operation in response to the device request via the sysfs repair control. Devices with memory repair features register with the EDAC device driver, which retrieves a memory repair descriptor from the EDAC memory repair driver and exposes the sysfs repair control attributes to userspace in /sys/bus/edac/devices//mem_repairX/. The common memory repair control interface abstracts the control of arbitrary memory repair functionality into a standardized set of functions. The sysfs memory repair attribute nodes are only available if the client driver has implemented the corresponding attribute callback function and provided operations to the EDAC device driver during registration.
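A hedged sketch of that registration pattern, using the structures and prototypes added below; the my_* names are illustrative, not real driver code:

	static int my_dev_register_repair(struct device *parent, void *drv_data)
	{
		/* my_get_repair_type/my_set_dpa/my_do_repair are hypothetical callbacks. */
		static const struct edac_mem_repair_ops my_repair_ops = {
			.get_repair_type	= my_get_repair_type,
			.set_dpa		= my_set_dpa,
			.do_repair		= my_do_repair,
		};
		struct edac_dev_feature feat = {
			.ft_type		= RAS_FEAT_MEM_REPAIR,
			.instance		= 0,
			.mem_repair_ops		= &my_repair_ops,
		};

		/* Exposes the mem_repairX/ attributes under /sys/bus/edac/devices/. */
		return edac_dev_register(parent, "my_ras_dev", drv_data, 1, &feat);
	}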
[ bp: Massage, fixup edac_dev_register() retvals, merge write_overflow fix to mem_repair_create_desc() ] Signed-off-by: Shiju Jose Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250212143654.1893-5-shiju.jose@huawei.com --- include/linux/edac.h | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'include/linux') diff --git a/include/linux/edac.h b/include/linux/edac.h index f8346014c14e..cfb2ef41ab95 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -668,6 +668,7 @@ static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci, enum edac_dev_feat { RAS_FEAT_SCRUB, RAS_FEAT_ECS, + RAS_FEAT_MEM_REPAIR, RAS_FEAT_MAX }; @@ -743,11 +744,82 @@ static inline int edac_ecs_get_desc(struct device *ecs_dev, { return -EOPNOTSUPP; } #endif /* CONFIG_EDAC_ECS */ +enum edac_mem_repair_type { + EDAC_REPAIR_MAX +}; + +enum edac_mem_repair_cmd { + EDAC_DO_MEM_REPAIR = 1, +}; + +/** + * struct edac_mem_repair_ops - memory repair operations + * (all elements are optional except do_repair, set_hpa/set_dpa) + * @get_repair_type: get the memory repair type, listed in + * enum edac_mem_repair_function. + * @get_persist_mode: get the current persist mode. + * false - Soft repair type (temporary repair). + * true - Hard memory repair type (permanent repair). + * @set_persist_mode: set the persist mode of the memory repair instance. + * @get_repair_safe_when_in_use: get whether memory media is accessible and + * data is retained during repair operation. + * @get_hpa: get current host physical address (HPA) of memory to repair. + * @set_hpa: set host physical address (HPA) of memory to repair. + * @get_min_hpa: get the minimum supported host physical address (HPA). + * @get_max_hpa: get the maximum supported host physical address (HPA). + * @get_dpa: get current device physical address (DPA) of memory to repair. + * @set_dpa: set device physical address (DPA) of memory to repair. + * In some states of system configuration (e.g. before address decoders + * have been configured), memory devices (e.g. CXL) may not have an active + * mapping in the host physical address map. As such, the memory + * to repair must be identified by a device specific physical addressing + * scheme using a device physical address(DPA). The DPA and other control + * attributes to use for the repair operations will be presented in related + * error records. + * @get_min_dpa: get the minimum supported device physical address (DPA). + * @get_max_dpa: get the maximum supported device physical address (DPA). + * @get_nibble_mask: get current nibble mask of memory to repair. + * @set_nibble_mask: set nibble mask of memory to repair. + * @do_repair: Issue memory repair operation for the HPA/DPA and + * other control attributes set for the memory to repair. + * + * All elements are optional except do_repair and at least one of set_hpa/set_dpa. 
+ */ +struct edac_mem_repair_ops { + int (*get_repair_type)(struct device *dev, void *drv_data, const char **type); + int (*get_persist_mode)(struct device *dev, void *drv_data, bool *persist); + int (*set_persist_mode)(struct device *dev, void *drv_data, bool persist); + int (*get_repair_safe_when_in_use)(struct device *dev, void *drv_data, bool *safe); + int (*get_hpa)(struct device *dev, void *drv_data, u64 *hpa); + int (*set_hpa)(struct device *dev, void *drv_data, u64 hpa); + int (*get_min_hpa)(struct device *dev, void *drv_data, u64 *hpa); + int (*get_max_hpa)(struct device *dev, void *drv_data, u64 *hpa); + int (*get_dpa)(struct device *dev, void *drv_data, u64 *dpa); + int (*set_dpa)(struct device *dev, void *drv_data, u64 dpa); + int (*get_min_dpa)(struct device *dev, void *drv_data, u64 *dpa); + int (*get_max_dpa)(struct device *dev, void *drv_data, u64 *dpa); + int (*get_nibble_mask)(struct device *dev, void *drv_data, u32 *val); + int (*set_nibble_mask)(struct device *dev, void *drv_data, u32 val); + int (*do_repair)(struct device *dev, void *drv_data, u32 val); +}; + +#if IS_ENABLED(CONFIG_EDAC_MEM_REPAIR) +int edac_mem_repair_get_desc(struct device *dev, + const struct attribute_group **attr_groups, + u8 instance); +#else +static inline int edac_mem_repair_get_desc(struct device *dev, + const struct attribute_group **attr_groups, + u8 instance) +{ return -EOPNOTSUPP; } +#endif /* CONFIG_EDAC_MEM_REPAIR */ + /* EDAC device feature information structure */ struct edac_dev_data { union { const struct edac_scrub_ops *scrub_ops; const struct edac_ecs_ops *ecs_ops; + const struct edac_mem_repair_ops *mem_repair_ops; }; u8 instance; void *private; @@ -758,6 +830,7 @@ struct edac_dev_feat_ctx { void *private; struct edac_dev_data *scrub; struct edac_dev_data ecs; + struct edac_dev_data *mem_repair; }; struct edac_dev_feature { @@ -766,6 +839,7 @@ struct edac_dev_feature { enum edac_dev_feat ft_type; u8 instance; union { const struct edac_scrub_ops *scrub_ops; const struct edac_ecs_ops *ecs_ops; + const struct edac_mem_repair_ops *mem_repair_ops; }; void *ctx; struct edac_ecs_ex_info ecs_info; -- cgit v1.2.3 From 007094c83872ed33c1d9e39b3ef7168d85a3f214 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 25 Feb 2025 10:52:10 +0100 Subject: gpiolib: use the required minimum set of headers Andy suggested we should keep a fine-grained scheme for includes and only pull in stuff required within individual ifdef sections. Let's revert commit dea69f2d1cc8 ("gpiolib: move all includes to the top of gpio/consumer.h") and make the headers situation even more fine-grained by only including the first level headers containing required symbols except for bug.h where checkpatch.pl warns against including asm/bug.h.
Fixes: dea69f2d1cc8 ("gpiolib: move all includes to the top of gpio/consumer.h") Suggested-by: Andy Shevchenko Closes: https://lore.kernel.org/all/Z7XPcYtaA4COHDYj@smile.fi.intel.com/ Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250225095210.25910-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 0b2b56199c36..824a1717e6d2 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -3,10 +3,7 @@ #define __LINUX_GPIO_CONSUMER_H #include -#include #include -#include -#include #include struct acpi_device; @@ -185,6 +182,9 @@ struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev, #else /* CONFIG_GPIOLIB */ +#include +#include + static inline int gpiod_count(struct device *dev, const char *con_id) { return 0; @@ -549,6 +549,9 @@ struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev, int gpiod_enable_hw_timestamp_ns(struct gpio_desc *desc, unsigned long flags); int gpiod_disable_hw_timestamp_ns(struct gpio_desc *desc, unsigned long flags); #else + +#include + static inline int gpiod_enable_hw_timestamp_ns(struct gpio_desc *desc, unsigned long flags) { -- cgit v1.2.3 From 81e42fc1d3036efd45f66c03a79654fef00ef380 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Mon, 24 Feb 2025 12:13:40 +0100 Subject: EDAC: Update memory repair control interface for memory sparing feature Update memory repair control interface for memory sparing feature. CXL memory devices can support soft and hard memory sparing at cacheline, row, bank and rank granularities. Memory sparing is defined as a repair function that replaces a portion of memory with a portion of functional memory at that same granularity. When a CXL device detects an error in memory, it will report to the host that there's need for a repair maintenance operation by using an event record where the "maintenance needed" flag is set. The event records contain the device physical address (DPA) and other attributes of the memory to repair such as bank group, bank, rank, row, column, channel etc. The kernel will report the corresponding CXL general media or DRAM trace event to userspace, and userspace tools (e.g. rasdaemon) will initiate a repair operation in response to the device request via the sysfs repair control. [ bp: Massage. ] Signed-off-by: Shiju Jose Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250212143654.1893-15-shiju.jose@huawei.com --- include/linux/edac.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/edac.h b/include/linux/edac.h index cfb2ef41ab95..451f9c152c99 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -780,6 +780,20 @@ enum edac_mem_repair_cmd { * @get_max_dpa: get the maximum supported device physical address (DPA). * @get_nibble_mask: get current nibble mask of memory to repair. * @set_nibble_mask: set nibble mask of memory to repair. + * @get_bank_group: get current bank group of memory to repair. + * @set_bank_group: set bank group of memory to repair. + * @get_bank: get current bank of memory to repair. + * @set_bank: set bank of memory to repair. + * @get_rank: get current rank of memory to repair. + * @set_rank: set rank of memory to repair. + * @get_row: get current row of memory to repair. + * @set_row: set row of memory to repair. 
+ * @get_column: get current column of memory to repair. + * @set_column: set column of memory to repair. + * @get_channel: get current channel of memory to repair. + * @set_channel: set channel of memory to repair. + * @get_sub_channel: get current subchannel of memory to repair. + * @set_sub_channel: set subchannel of memory to repair. * @do_repair: Issue memory repair operation for the HPA/DPA and * other control attributes set for the memory to repair. * @@ -800,6 +814,20 @@ struct edac_mem_repair_ops { int (*get_max_dpa)(struct device *dev, void *drv_data, u64 *dpa); int (*get_nibble_mask)(struct device *dev, void *drv_data, u32 *val); int (*set_nibble_mask)(struct device *dev, void *drv_data, u32 val); + int (*get_bank_group)(struct device *dev, void *drv_data, u32 *val); + int (*set_bank_group)(struct device *dev, void *drv_data, u32 val); + int (*get_bank)(struct device *dev, void *drv_data, u32 *val); + int (*set_bank)(struct device *dev, void *drv_data, u32 val); + int (*get_rank)(struct device *dev, void *drv_data, u32 *val); + int (*set_rank)(struct device *dev, void *drv_data, u32 val); + int (*get_row)(struct device *dev, void *drv_data, u32 *val); + int (*set_row)(struct device *dev, void *drv_data, u32 val); + int (*get_column)(struct device *dev, void *drv_data, u32 *val); + int (*set_column)(struct device *dev, void *drv_data, u32 val); + int (*get_channel)(struct device *dev, void *drv_data, u32 *val); + int (*set_channel)(struct device *dev, void *drv_data, u32 val); + int (*get_sub_channel)(struct device *dev, void *drv_data, u32 *val); + int (*set_sub_channel)(struct device *dev, void *drv_data, u32 val); int (*do_repair)(struct device *dev, void *drv_data, u32 val); }; -- cgit v1.2.3 From 8ce258f62f90cb2d339cc39fa43e5634594a9dfb Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 20 Feb 2025 10:56:59 +0100 Subject: gpiolib: make value setters have return values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change the in-kernel consumer interface for GPIOs: make all variants of value setters that don't have a return value, return a signed integer instead. That will allow these routines to indicate failures to callers. This doesn't change the implementation just yet, we'll do it in subsequent commits. We need to update the gpio-latch module as it passes the address of value setters as a function pointer argument and thus cares about its type. 
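A hedged sketch of the consumer-side pattern the new return values enable; assert_reset() and its descriptor argument are illustrative:

	static int assert_reset(struct gpio_desc *reset_gpiod)
	{
		int ret;

		/* Setters can now report failures, e.g. from an I2C GPIO expander. */
		ret = gpiod_set_value_cansleep(reset_gpiod, 1);
		if (ret)
			return ret;

		fsleep(1000);

		return gpiod_set_value_cansleep(reset_gpiod, 0);
	}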
Reviewed-by: Linus Walleij Acked-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20250220-gpio-set-retval-v2-2-bc4cfd38dae3@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio.h | 4 ++-- include/linux/gpio/consumer.h | 22 +++++++++++++--------- 2 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio.h b/include/linux/gpio.h index 6270150f4e29..c1ec62c11ed3 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -91,7 +91,7 @@ static inline int gpio_get_value_cansleep(unsigned gpio) } static inline void gpio_set_value_cansleep(unsigned gpio, int value) { - return gpiod_set_raw_value_cansleep(gpio_to_desc(gpio), value); + gpiod_set_raw_value_cansleep(gpio_to_desc(gpio), value); } static inline int gpio_get_value(unsigned gpio) @@ -100,7 +100,7 @@ static inline int gpio_get_value(unsigned gpio) } static inline void gpio_set_value(unsigned gpio, int value) { - return gpiod_set_raw_value(gpio_to_desc(gpio), value); + gpiod_set_raw_value(gpio_to_desc(gpio), value); } static inline int gpio_to_irq(unsigned gpio) diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 824a1717e6d2..45b651c05b9c 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -118,7 +118,7 @@ int gpiod_get_array_value(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap); -void gpiod_set_value(struct gpio_desc *desc, int value); +int gpiod_set_value(struct gpio_desc *desc, int value); int gpiod_set_array_value(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, @@ -128,7 +128,7 @@ int gpiod_get_raw_array_value(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap); -void gpiod_set_raw_value(struct gpio_desc *desc, int value); +int gpiod_set_raw_value(struct gpio_desc *desc, int value); int gpiod_set_raw_array_value(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, @@ -140,7 +140,7 @@ int gpiod_get_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap); -void gpiod_set_value_cansleep(struct gpio_desc *desc, int value); +int gpiod_set_value_cansleep(struct gpio_desc *desc, int value); int gpiod_set_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, @@ -150,7 +150,7 @@ int gpiod_get_raw_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap); -void gpiod_set_raw_value_cansleep(struct gpio_desc *desc, int value); +int gpiod_set_raw_value_cansleep(struct gpio_desc *desc, int value); int gpiod_set_raw_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, @@ -360,10 +360,11 @@ static inline int gpiod_get_array_value(unsigned int array_size, WARN_ON(desc_array); return 0; } -static inline void gpiod_set_value(struct gpio_desc *desc, int value) +static inline int gpiod_set_value(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ WARN_ON(desc); + return 0; } static inline int gpiod_set_array_value(unsigned int array_size, struct gpio_desc **desc_array, @@ -389,10 +390,11 @@ static inline int gpiod_get_raw_array_value(unsigned int array_size, WARN_ON(desc_array); return 0; } -static inline void 
gpiod_set_raw_value(struct gpio_desc *desc, int value) +static inline int gpiod_set_raw_value(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ WARN_ON(desc); + return 0; } static inline int gpiod_set_raw_array_value(unsigned int array_size, struct gpio_desc **desc_array, @@ -419,10 +421,11 @@ static inline int gpiod_get_array_value_cansleep(unsigned int array_size, WARN_ON(desc_array); return 0; } -static inline void gpiod_set_value_cansleep(struct gpio_desc *desc, int value) +static inline int gpiod_set_value_cansleep(struct gpio_desc *desc, int value) { /* GPIO can never have been requested */ WARN_ON(desc); + return 0; } static inline int gpiod_set_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, @@ -448,11 +451,12 @@ static inline int gpiod_get_raw_array_value_cansleep(unsigned int array_size, WARN_ON(desc_array); return 0; } -static inline void gpiod_set_raw_value_cansleep(struct gpio_desc *desc, - int value) +static inline int gpiod_set_raw_value_cansleep(struct gpio_desc *desc, + int value) { /* GPIO can never have been requested */ WARN_ON(desc); + return 0; } static inline int gpiod_set_raw_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, -- cgit v1.2.3 From 98ce1eb1fd87ea1b016e0913ef6836ab0139b5c4 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 20 Feb 2025 10:57:02 +0100 Subject: gpiolib: introduce gpio_chip setters that return values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add new variants of the set() and set_multiple() callbacks that have integer return values allowing to indicate failures to users of the GPIO consumer API. Until we convert all GPIO providers treewide to using them, they will live in parallel to the existing ones. Make sure that providers cannot define both. Prefer the new ones and only use the old ones as fallback. Reviewed-by: Linus Walleij Acked-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20250220-gpio-set-retval-v2-5-bc4cfd38dae3@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index ae96a2f260fb..a2a1b6434321 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -348,6 +348,10 @@ struct gpio_irq_chip { * stores them in "bits", returns 0 on success or negative error * @set: assigns output value for signal "offset" * @set_multiple: assigns output values for multiple signals defined by "mask" + * @set_rv: assigns output value for signal "offset", returns 0 on success or + * negative error value + * @set_multiple_rv: assigns output values for multiple signals defined by + * "mask", returns 0 on success or negative error value * @set_config: optional hook for all kinds of settings. Uses the same * packed config format as generic pinconf. Must return 0 on success and * a negative error number on failure. 
@@ -445,6 +449,12 @@ struct gpio_chip { void (*set_multiple)(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits); + int (*set_rv)(struct gpio_chip *gc, + unsigned int offset, + int value); + int (*set_multiple_rv)(struct gpio_chip *gc, + unsigned long *mask, + unsigned long *bits); int (*set_config)(struct gpio_chip *gc, unsigned int offset, unsigned long config); -- cgit v1.2.3 From 9a54fb31343362f93680543e37afc14484c185d9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2025 13:37:04 +0100 Subject: x86/cfi: Add 'cfi=warn' boot option Rebuilding with CONFIG_CFI_PERMISSIVE=y enabled is such a pain, esp. since clang is so slow. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250224124159.924496481@infradead.org --- include/linux/cfi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cfi.h b/include/linux/cfi.h index f0df518e11dd..1db17ecbb86c 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -11,6 +11,8 @@ #include #include +extern bool cfi_warn; + #ifndef cfi_get_offset static inline int cfi_get_offset(void) { -- cgit v1.2.3 From 4ff6039ffb79a4a8a44b63810a8a2f2b43264856 Mon Sep 17 00:00:00 2001 From: Yuanfang Zhang Date: Thu, 16 Jan 2025 17:04:20 +0800 Subject: coresight-etm4x: add isb() before reading the TRCSTATR As recommended by section 4.3.7 ("Synchronization when using system instructions to program the trace unit") of ARM IHI 0064H.b, the self-hosted trace analyzer must perform a Context synchronization event between writing to the TRCPRGCTLR and reading the TRCSTATR. Additionally, add an ISB between each read of TRCSTATR on coresight_timeout() when using system instructions to program the trace unit. Fixes: 1ab3bb9df5e3 ("coresight: etm4x: Add necessary synchronization for sysreg access") Signed-off-by: Yuanfang Zhang Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250116-etm_sync-v4-1-39f2b05e9514@quicinc.com --- include/linux/coresight.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 3eef4e91df3f..c7614ccb3232 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -661,6 +661,10 @@ extern int coresight_enable_sysfs(struct coresight_device *csdev); extern void coresight_disable_sysfs(struct coresight_device *csdev); extern int coresight_timeout(struct csdev_access *csa, u32 offset, int position, int value); +typedef void (*coresight_timeout_cb_t) (struct csdev_access *, u32, int, int); +extern int coresight_timeout_action(struct csdev_access *csa, u32 offset, + int position, int value, + coresight_timeout_cb_t cb); extern int coresight_claim_device(struct coresight_device *csdev); extern int coresight_claim_device_unlocked(struct coresight_device *csdev); -- cgit v1.2.3 From 7ebd85022c0075c4465a51e8ace4e08dfce747b1 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 24 Feb 2025 01:06:10 +0000 Subject: PM: clk: remove unused of_pm_clk_add_clk() The last use of of_pm_clk_add_clk() was removed by 2019's commit fe00f8900ca7 ("irqchip/gic-pm: Update driver to use clk_bulk APIs"). Remove it. Note that the plural version of_pm_clk_add_clks() is still being used and is left. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20250224010610.187503-1-linux@treblig.org Signed-off-by: Rafael J.
Wysocki --- include/linux/pm_clock.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pm_clock.h b/include/linux/pm_clock.h index 68669ce18720..45c3f3ccbaf8 100644 --- a/include/linux/pm_clock.h +++ b/include/linux/pm_clock.h @@ -41,7 +41,6 @@ extern int pm_clk_create(struct device *dev); extern void pm_clk_destroy(struct device *dev); extern int pm_clk_add(struct device *dev, const char *con_id); extern int pm_clk_add_clk(struct device *dev, struct clk *clk); -extern int of_pm_clk_add_clk(struct device *dev, const char *name); extern int of_pm_clk_add_clks(struct device *dev); extern void pm_clk_remove(struct device *dev, const char *con_id); extern void pm_clk_remove_clk(struct device *dev, struct clk *clk); -- cgit v1.2.3 From b2aba529bf77ebdc1a1841b884ff841c1d21f6af Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 24 Feb 2025 15:55:42 -0800 Subject: KVM: Drop kvm_arch_sync_events() now that all implementations are nops Remove kvm_arch_sync_events() now that x86 no longer uses it (no other arch has ever used it). No functional change intended. Signed-off-by: Sean Christopherson Acked-by: Claudio Imbrenda Reviewed-by: Bibo Mao Message-ID: <20250224235542.2562848-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f34f4cfaa513..ba6f29872ab2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1746,7 +1746,6 @@ static inline void kvm_unregister_perf_callbacks(void) {} int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); void kvm_arch_destroy_vm(struct kvm *kvm); -void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 9ec84f79c5a7a65cd69b5b705a203759665160cd Mon Sep 17 00:00:00 2001 From: Luo Gengkun Date: Mon, 23 Dec 2024 07:06:49 +0000 Subject: perf: Remove unnecessary parameter of security check It seems that the attr parameter has never been used in security checks since it was first introduced by commit da97e18458fb ("perf_event: Add support for LSM and SELinux checks"), so remove it.
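As a hedged illustration of the resulting hook shape (hypothetical LSM code, not part of this patch), a security module now only keys off the type constant, matching the updated LSM_HOOK() prototype in the diff below:

static int example_perf_event_open(int type)
{
	/* type is PERF_SECURITY_OPEN, PERF_SECURITY_CPU, PERF_SECURITY_TRACEPOINT, ... */
	if (type == PERF_SECURITY_CPU && !perfmon_capable())
		return -EACCES;

	return 0;
}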
Signed-off-by: Luo Gengkun Reviewed-by: Ingo Molnar Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 +- include/linux/perf_event.h | 10 +++++----- include/linux/security.h | 5 ++--- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 9eb313bd0c93..2bf909fa3394 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -445,7 +445,7 @@ LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap) LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) #ifdef CONFIG_PERF_EVENTS -LSM_HOOK(int, 0, perf_event_open, struct perf_event_attr *attr, int type) +LSM_HOOK(int, 0, perf_event_open, int type) LSM_HOOK(int, 0, perf_event_alloc, struct perf_event *event) LSM_HOOK(int, 0, perf_event_read, struct perf_event *event) LSM_HOOK(int, 0, perf_event_write, struct perf_event *event) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8333f132f4a9..5d2ec4283ebf 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1672,22 +1672,22 @@ static inline int perf_is_paranoid(void) return sysctl_perf_event_paranoid > -1; } -int perf_allow_kernel(struct perf_event_attr *attr); +int perf_allow_kernel(void); -static inline int perf_allow_cpu(struct perf_event_attr *attr) +static inline int perf_allow_cpu(void) { if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) return -EACCES; - return security_perf_event_open(attr, PERF_SECURITY_CPU); + return security_perf_event_open(PERF_SECURITY_CPU); } -static inline int perf_allow_tracepoint(struct perf_event_attr *attr) +static inline int perf_allow_tracepoint(void) { if (sysctl_perf_event_paranoid > -1 && !perfmon_capable()) return -EPERM; - return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); + return security_perf_event_open(PERF_SECURITY_TRACEPOINT); } extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs); diff --git a/include/linux/security.h b/include/linux/security.h index 27f64a9747f8..1545d515a66b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2324,14 +2324,13 @@ struct perf_event_attr; struct perf_event; #ifdef CONFIG_SECURITY -extern int security_perf_event_open(struct perf_event_attr *attr, int type); +extern int security_perf_event_open(int type); extern int security_perf_event_alloc(struct perf_event *event); extern void security_perf_event_free(struct perf_event *event); extern int security_perf_event_read(struct perf_event *event); extern int security_perf_event_write(struct perf_event *event); #else -static inline int security_perf_event_open(struct perf_event_attr *attr, - int type) +static inline int security_perf_event_open(int type) { return 0; } -- cgit v1.2.3 From eb50844d728f11e87491f7c7af15a4a737f1159d Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 25 Feb 2025 21:58:06 +0800 Subject: of: property: Increase NR_FWNODE_REFERENCE_ARGS Currently, the following two macros have different values: // The maximal argument count for firmware node reference #define NR_FWNODE_REFERENCE_ARGS 8 // The maximal argument count for DT node reference #define MAX_PHANDLE_ARGS 16 This can push the firmware node reference's argument count out of range if the DT node reference's argument count is directly assigned to it. drivers/of/property.c:of_fwnode_get_reference_args() is doing the direct assignment, so the firmware's argument count @args->nargs may go out of range, namely, in [9, 16].
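A simplified sketch of the assignment in question (illustrative only, not the literal drivers/of/property.c code):

static void example_copy(struct fwnode_reference_args *args,
			 const struct of_phandle_args *of_args)
{
	unsigned int i;

	args->nargs = of_args->args_count;	/* DT side allows up to 16 */
	for (i = 0; i < of_args->args_count; i++)
		args->args[i] = of_args->args[i];	/* args[] held only 8 entries before the fix */
}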
Fix by increasing NR_FWNODE_REFERENCE_ARGS to 16 to meet the DT requirement. Will align both macros later to avoid such inconsistency. Fixes: 3e3119d3088f ("device property: Introduce fwnode_property_get_reference_args") Signed-off-by: Zijun Hu Acked-by: Sakari Ailus Link: https://lore.kernel.org/r/20250225-fix_arg_count-v4-1-13cdc519eb31@quicinc.com Signed-off-by: Rob Herring (Arm) --- include/linux/fwnode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 0731994b9d7c..6fa0a268d538 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -91,7 +91,7 @@ struct fwnode_endpoint { #define SWNODE_GRAPH_PORT_NAME_FMT "port@%u" #define SWNODE_GRAPH_ENDPOINT_NAME_FMT "endpoint@%u" -#define NR_FWNODE_REFERENCE_ARGS 8 +#define NR_FWNODE_REFERENCE_ARGS 16 /** * struct fwnode_reference_args - Fwnode reference with additional arguments -- cgit v1.2.3 From 2ac95560fbe1946f0faf51d8db62f6f2b67ee5a3 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 25 Feb 2025 21:58:07 +0800 Subject: of: Align macro MAX_PHANDLE_ARGS with NR_FWNODE_REFERENCE_ARGS Macro NR_FWNODE_REFERENCE_ARGS defines the maximal argument count for firmware node reference, and MAX_PHANDLE_ARGS defines the maximal argument count for DT node reference; both have the same value now. To avoid argument count inconsistency between firmware and DT, simply align both macros by '#define MAX_PHANDLE_ARGS NR_FWNODE_REFERENCE_ARGS'. Signed-off-by: Zijun Hu Reviewed-by: Sakari Ailus Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250225-fix_arg_count-v4-2-13cdc519eb31@quicinc.com Signed-off-by: Rob Herring (Arm) --- include/linux/of.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index eaf0e2a2b75c..86bf8f073111 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -67,7 +67,7 @@ struct device_node { #endif }; -#define MAX_PHANDLE_ARGS 16 +#define MAX_PHANDLE_ARGS NR_FWNODE_REFERENCE_ARGS struct of_phandle_args { struct device_node *np; int args_count; -- cgit v1.2.3 From 84b25926fa7abb5b634d4af9544f47ab0aabc399 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 26 Feb 2025 09:21:18 -0700 Subject: acpi: numa: Add support to enumerate and store extended linear address mode Store the address mode as part of the cache attributes. Export the mode attribute to sysfs like all other cache attributes.
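As a hedged sketch (hypothetical values; the field and enum names follow the node.h diff below), a memory-side cache description with the new attribute could look like:

struct node_cache_attrs cache = {
	.size		= SZ_4G,	/* made-up cache size */
	.line_size	= 64,
	.level		= 1,
	.address_mode	= NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR,
};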
Link: https://lore.kernel.org/linux-cxl/668333b17e4b2_5639294fd@dwillia2-xfh.jf.intel.com.notmuch/ Reviewed-by: Jonathan Cameron Reviewed-by: Li Ming Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20250226162224.3633792-2-dave.jiang@intel.com Signed-off-by: Dave Jiang --- include/linux/node.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/node.h b/include/linux/node.h index 9a881c2208b3..2b7517892230 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -57,6 +57,11 @@ enum cache_write_policy { NODE_CACHE_WRITE_OTHER, }; +enum cache_mode { + NODE_CACHE_ADDR_MODE_RESERVED, + NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR, +}; + /** * struct node_cache_attrs - system memory caching attributes * @@ -65,6 +70,7 @@ enum cache_write_policy { * @size: Total size of cache in bytes * @line_size: Number of bytes fetched on a cache miss * @level: The cache hierarchy level + * @address_mode: The address mode */ struct node_cache_attrs { enum cache_indexing indexing; @@ -72,6 +78,7 @@ struct node_cache_attrs { u64 size; u16 line_size; u8 level; + u16 address_mode; }; #ifdef CONFIG_HMEM_REPORTING -- cgit v1.2.3 From 0ec9849b63338da7883440ed3f52757cd8c847b1 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 26 Feb 2025 09:21:19 -0700 Subject: acpi/hmat / cxl: Add extended linear cache support for CXL The current cxl region size only indicates the size of the CXL memory region without accounting for the extended linear cache size. Retrieve the cache size from HMAT and append that to the cxl region size for the cxl region range that matches the SRAT range that has extended linear cache enabled. The SRAT defines the whole memory range that includes the extended linear cache and the CXL memory region. The new HMAT ECN/ECR to the Memory Side Cache Information Structure defines the size of the extended linear cache size and matches to the SRAT Memory Affinity Structure by the memory proxmity domain. Add a helper to match the cxl range to the SRAT memory range in order to retrieve the cache size. There are several places that checks the cxl region range against the decoder range. Use new helper to check between the two ranges and address the new cache size. Reviewed-by: Jonathan Cameron Reviewed-by: Li Ming Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20250226162224.3633792-3-dave.jiang@intel.com Signed-off-by: Dave Jiang --- include/linux/acpi.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4e495b29c640..cbd933504dbf 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1095,6 +1095,17 @@ static inline acpi_handle acpi_get_processor_handle(int cpu) #endif /* !CONFIG_ACPI */ +#ifdef CONFIG_ACPI_HMAT +int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid, + resource_size_t *size); +#else +static inline int hmat_get_extended_linear_cache_size(struct resource *backing_res, + int nid, resource_size_t *size) +{ + return -EOPNOTSUPP; +} +#endif + extern void arch_post_acpi_subsys_init(void); #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC -- cgit v1.2.3 From 57e5cc9b8a399f85c95b6249ecb423553aad209c Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 21 Feb 2025 14:02:25 +0000 Subject: KVM: arm64: Specify hypercall ABI for retrieving target implementations If the Guest requires migration to multiple targets, these hypercalls will provide a way to retrieve the target CPU implementations from the user space VMM. 
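For illustration, a guest could probe the interface roughly as follows (a sketch only; the layout of the returned registers is defined by the accompanying documentation, so the res.a0 handling here is an assumption):

static bool example_have_impl_discovery(void)
{
	struct arm_smccc_res res;

	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_DISCOVER_IMPL_VER_FUNC_ID,
			     &res);

	/* SMCCC failures are reported as negative values in a0. */
	return (long)res.a0 >= 0;
}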
A subsequent patch will use this to enable the associated errata. Suggested-by: Oliver Upton Suggested-by: Marc Zyngier Reviewed-by: Cornelia Huck Reviewed-by: Sebastian Ott Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20250221140229.12588-3-shameerali.kolothum.thodi@huawei.com Signed-off-by: Oliver Upton --- include/linux/arm-smccc.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 67f6fdf2e7cd..f19be5754090 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -179,6 +179,9 @@ #define ARM_SMCCC_KVM_FUNC_PKVM_RESV_62 62 #define ARM_SMCCC_KVM_FUNC_PKVM_RESV_63 63 /* End of pKVM hypercall range */ +#define ARM_SMCCC_KVM_FUNC_DISCOVER_IMPL_VER 64 +#define ARM_SMCCC_KVM_FUNC_DISCOVER_IMPL_CPUS 65 + #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 @@ -225,6 +228,18 @@ ARM_SMCCC_OWNER_VENDOR_HYP, \ ARM_SMCCC_KVM_FUNC_MMIO_GUARD) +#define ARM_SMCCC_VENDOR_HYP_KVM_DISCOVER_IMPL_VER_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_DISCOVER_IMPL_VER) + +#define ARM_SMCCC_VENDOR_HYP_KVM_DISCOVER_IMPL_CPUS_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_DISCOVER_IMPL_CPUS) + /* ptp_kvm counter type ID */ #define KVM_PTP_VIRT_COUNTER 0 #define KVM_PTP_PHYS_COUNTER 1 -- cgit v1.2.3 From 3ba075278c11cdb19e2dbb80362042f1b0c08f74 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Feb 2025 17:10:48 +0000 Subject: tcp: be less liberal in TSEcr received while in SYN_RECV state Yong-Hao Zou mentioned that Linux was not as strict as other OSes in the 3WHS for flows using the TCP TS option (RFC 7323). As hinted by an old comment in tcp_check_req(), we can check that the TSEcr value in the incoming packet corresponds to one of the SYNACK TSval values we have sent. In this patch, I record the oldest and most recent values that SYNACK packets have used. Send a challenge ACK if we receive a TSEcr outside of this range, and increment a new SNMP counter. nstat -az | grep TSEcrRejected TcpExtTSEcrRejected 0 0.0 Due to TCP fastopen implementation, do not yet apply these checks for fastopen flows. v2: No longer use req->num_timeout, but treq->snt_tsval_first to detect when first SYNACK is prepared. This means we make sure to not send an initial zero TSval. Make sure MPTCP and TCP selftests are passing. Change MIB name to TcpExtTSEcrRejected v1: https://lore.kernel.org/netdev/CADVnQykD8i4ArpSZaPKaoNxLJ2if2ts9m4As+=Jvdkrgx1qMHw@mail.gmail.com/T/ Reported-by: Yong-Hao Zou Signed-off-by: Eric Dumazet Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250225171048.3105061-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f88daaa76d83..159b2c59eb62 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -160,6 +160,8 @@ struct tcp_request_sock { u32 rcv_isn; u32 snt_isn; u32 ts_off; + u32 snt_tsval_first; + u32 snt_tsval_last; u32 last_oow_ack_time; /* last SYNACK */ u32 rcv_nxt; /* the ack # by SYNACK.
For * FastOpen it's the seq# -- cgit v1.2.3 From e6116fc605574bb58c2016938ff24a7fbafe6e2a Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 24 Feb 2025 21:33:55 -0500 Subject: net: skb: free up one bit in tx_flags The linked series wants to add skb tx completion timestamps. That needs a bit in skb_shared_info.tx_flags, but all are in use. A per-skb bit is only needed for features that are configured on a per packet basis. Per socket features can be read from sk->sk_tsflags. Per packet tsflags can be set in sendmsg using cmsg, but only those in SOF_TIMESTAMPING_TX_RECORD_MASK. Per packet tsflags can also be set without cmsg by sandwiching a send inbetween two setsockopts: val |= SOF_TIMESTAMPING_$FEATURE; setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)); write(fd, buf, sz); val &= ~SOF_TIMESTAMPING_$FEATURE; setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)); Changing a datapath test from skb_shinfo(skb)->tx_flags to skb->sk->sk_tsflags can change behavior in that case, as the tx_flags is written before the second setsockopt updates sk_tsflags. Therefore, only bits can be reclaimed that cannot be set by cmsg and are also highly unlikely to be used to target individual packets otherwise. Free up the bit currently used for SKBTX_HW_TSTAMP_USE_CYCLES. This selects between clock and free running counter source for HW TX timestamps. It is probable that all packets of the same socket will always use the same source. Link: https://lore.kernel.org/netdev/cover.1739988644.git.pav@iki.fi/ Signed-off-by: Willem de Bruijn Reviewed-by: Jason Xing Reviewed-by: Gerhard Engleder Link: https://patch.msgid.link/20250225023416.2088705-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f2bb8473d99a..171aa15f6541 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -478,8 +478,8 @@ enum { /* device driver is going to provide hardware time stamp */ SKBTX_IN_PROGRESS = 1 << 2, - /* generate hardware time stamp based on cycles if supported */ - SKBTX_HW_TSTAMP_USE_CYCLES = 1 << 3, + /* reserved */ + SKBTX_RESERVED = 1 << 3, /* generate wifi status information (where possible) */ SKBTX_WIFI_STATUS = 1 << 4, @@ -500,7 +500,6 @@ enum { SKBTX_SCHED_TSTAMP | \ SKBTX_BPF) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \ - SKBTX_HW_TSTAMP_USE_CYCLES | \ SKBTX_ANY_SW_TSTAMP) /* Definitions for flags in struct skb_shared_info */ -- cgit v1.2.3 From bd7c00605ee0cf0bf27764d5da0b948d8004229e Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Mon, 24 Feb 2025 16:22:22 -0700 Subject: net: move aRFS rmap management and CPU affinity to core A common task for most drivers is to remember the user-set CPU affinity to its IRQs. On each netdev reset, the driver should re-assign the user's settings to the IRQs. Unify this task across all drivers by moving the CPU affinity to napi->config. However, to move the CPU affinity to core, we also need to move aRFS rmap management since aRFS uses its own IRQ notifiers. For the aRFS, add a new netdev flag "rx_cpu_rmap_auto". Drivers supporting aRFS should set the flag via netif_enable_cpu_rmap() and core will allocate and manage the aRFS rmaps. Freeing the rmap is also done by core when the netdev is freed. 
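A hedged sketch of the rmap part of the driver-side conversion (hypothetical driver code; the signature matches the netdevice.h change below):

static int example_setup_arfs(struct net_device *ndev, unsigned int num_rx_irqs)
{
	/* Replaces the driver's private alloc_irq_cpu_rmap()/IRQ notifier plumbing. */
	return netif_enable_cpu_rmap(ndev, num_rx_irqs);
}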
For better IRQ affinity management, move the IRQ rmap notifier inside the napi_struct and add new notify.notify and notify.release functions: netif_irq_cpu_rmap_notify() and netif_napi_affinity_release(). Now that the aRFS rmap management is in the core, add a CPU affinity mask to napi_config. To delegate the CPU affinity management to the core, drivers must: 1 - set the new netdev flag "irq_affinity_auto": netif_enable_irq_affinity(netdev) 2 - create the napi with persistent config: netif_napi_add_config() 3 - bind an IRQ to the napi instance: netif_napi_set_irq() The core will then make sure to re-assign the affinity to the napi's IRQ. The default IRQ mask is set to one CPU, starting from the closest NUMA node. Signed-off-by: Ahmed Zaki Link: https://patch.msgid.link/20250224232228.990783-2-ahmed.zaki@intel.com Signed-off-by: Jakub Kicinski --- include/linux/cpu_rmap.h | 1 + include/linux/netdevice.h | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu_rmap.h b/include/linux/cpu_rmap.h index 20b5729903d7..2fd7ba75362a 100644 --- a/include/linux/cpu_rmap.h +++ b/include/linux/cpu_rmap.h @@ -32,6 +32,7 @@ struct cpu_rmap { #define CPU_RMAP_DIST_INF 0xffff extern struct cpu_rmap *alloc_cpu_rmap(unsigned int size, gfp_t flags); +extern void cpu_rmap_get(struct cpu_rmap *rmap); extern int cpu_rmap_put(struct cpu_rmap *rmap); extern int cpu_rmap_add(struct cpu_rmap *rmap, void *obj); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9a387d456592..2094d3edda73 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -352,6 +352,7 @@ struct napi_config { u64 gro_flush_timeout; u64 irq_suspend_timeout; u32 defer_hard_irqs; + cpumask_t affinity_mask; unsigned int napi_id; }; @@ -394,6 +395,8 @@ struct napi_struct { struct list_head dev_list; struct hlist_node napi_hash_node; int irq; + struct irq_affinity_notify notify; + int napi_rmap_idx; int index; struct napi_config *config; }; @@ -409,6 +412,7 @@ enum { NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/ NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */ + NAPI_STATE_HAS_NOTIFIER, /* Napi has an IRQ notifier */ }; enum { @@ -422,6 +426,7 @@ enum { NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED), + NAPIF_STATE_HAS_NOTIFIER = BIT(NAPI_STATE_HAS_NOTIFIER), }; enum gro_result { @@ -1989,6 +1994,15 @@ enum netdev_reg_state { * * @threaded: napi threaded mode is enabled * + * @irq_affinity_auto: driver wants the core to store and re-assign the IRQ + * affinity. Set by netif_enable_irq_affinity(), then + * the driver must create a persistent napi by + * netif_napi_add_config() and finally bind the napi to + * IRQ (via netif_napi_set_irq()). + * + * @rx_cpu_rmap_auto: driver wants the core to manage the ARFS rmap. + * Set by calling netif_enable_cpu_rmap().
+ * * @see_all_hwtstamp_requests: device wants to see calls to * ndo_hwtstamp_set() for all timestamp requests * regardless of source, even if those aren't @@ -2396,6 +2410,8 @@ struct net_device { struct lock_class_key *qdisc_tx_busylock; bool proto_down; bool threaded; + bool irq_affinity_auto; + bool rx_cpu_rmap_auto; /* priv_flags_slow, ungrouped to save space */ unsigned long see_all_hwtstamp_requests:1; @@ -2724,10 +2740,7 @@ static inline void netdev_assert_locked_or_invisible(struct net_device *dev) netdev_assert_locked(dev); } -static inline void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) -{ - napi->irq = irq; -} +void netif_napi_set_irq_locked(struct napi_struct *napi, int irq); static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) { @@ -2865,6 +2878,9 @@ static inline void netif_napi_del(struct napi_struct *napi) synchronize_net(); } +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs); +void netif_set_affinity_auto(struct net_device *dev); + struct packet_type { __be16 type; /* This is really htons(ether_type). */ bool ignore_outgoing; -- cgit v1.2.3 From 0da30874729baeb01889b0eca16cfda122687503 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 24 Feb 2025 16:04:17 +0200 Subject: dmaengine: ti: k3-udma-glue: Drop skip_fdq argument from k3_udma_glue_reset_rx_chn The user of k3_udma_glue_reset_rx_chn(), e.g. ti_am65_cpsw_nuss, can run on multiple platforms having different DMA architectures. On some platforms there can be one FDQ for all flows in the RX channel while for others there is a separate FDQ for each flow in the RX channel. So far we have been relying on the skip_fdq argument of k3_udma_glue_reset_rx_chn(). Instead of relying on the user to provide this information, infer it from the DMA architecture during k3_udma_glue_request_rx_chn() and save it in an internal flag 'single_fdq'. Use that flag at k3_udma_glue_reset_rx_chn() to decide if the FDQ needs to be cleared for every flow or just for flow 0. This fixes the below issue in the ti_am65_cpsw_nuss driver on AM62-SK.
> ip link set eth1 down > ip link set eth0 down > ethtool -L eth0 rx 8 > ip link set eth0 up > modprobe -r ti_am65_cpsw_nuss [ 103.045726] ------------[ cut here ]------------ [ 103.050505] k3_knav_desc_pool size 512000 != avail 64000 [ 103.050703] WARNING: CPU: 1 PID: 450 at drivers/net/ethernet/ti/k3-cppi-desc-pool.c:33 k3_cppi_desc_pool_destroy+0xa0/0xa8 [k3_cppi_desc_pool] [ 103.068810] Modules linked in: ti_am65_cpsw_nuss(-) k3_cppi_desc_pool snd_soc_hdmi_codec crct10dif_ce snd_soc_simple_card snd_soc_simple_card_utils display_connector rtc_ti_k3 k3_j72xx_bandgap tidss drm_client_lib snd_soc_davinci_mcas p drm_dma_helper tps6598x phylink snd_soc_ti_udma rti_wdt drm_display_helper snd_soc_tlv320aic3x_i2c typec at24 phy_gmii_sel snd_soc_ti_edma snd_soc_tlv320aic3x sii902x snd_soc_ti_sdma sa2ul omap_mailbox drm_kms_helper authenc cfg80211 r fkill fuse drm drm_panel_orientation_quirks backlight ip_tables x_tables ipv6 [last unloaded: k3_cppi_desc_pool] [ 103.119950] CPU: 1 UID: 0 PID: 450 Comm: modprobe Not tainted 6.13.0-rc7-00001-g9c5e3435fa66 #1011 [ 103.119968] Hardware name: Texas Instruments AM625 SK (DT) [ 103.119974] pstate: 80000005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 103.119983] pc : k3_cppi_desc_pool_destroy+0xa0/0xa8 [k3_cppi_desc_pool] [ 103.148007] lr : k3_cppi_desc_pool_destroy+0xa0/0xa8 [k3_cppi_desc_pool] [ 103.154709] sp : ffff8000826ebbc0 [ 103.158015] x29: ffff8000826ebbc0 x28: ffff0000090b6300 x27: 0000000000000000 [ 103.165145] x26: 0000000000000000 x25: 0000000000000000 x24: ffff0000019df6b0 [ 103.172271] x23: ffff0000019df6b8 x22: ffff0000019df410 x21: ffff8000826ebc88 [ 103.179397] x20: 000000000007d000 x19: ffff00000a3b3000 x18: 0000000000000000 [ 103.186522] x17: 0000000000000000 x16: 0000000000000000 x15: 000001e8c35e1cde [ 103.193647] x14: 0000000000000396 x13: 000000000000035c x12: 0000000000000000 [ 103.200772] x11: 000000000000003a x10: 00000000000009c0 x9 : ffff8000826eba20 [ 103.207897] x8 : ffff0000090b6d20 x7 : ffff00007728c180 x6 : ffff00007728c100 [ 103.215022] x5 : 0000000000000001 x4 : ffff000000508a50 x3 : ffff7ffff6146000 [ 103.222147] x2 : 0000000000000000 x1 : e300b4173ee6b200 x0 : 0000000000000000 [ 103.229274] Call trace: [ 103.231714] k3_cppi_desc_pool_destroy+0xa0/0xa8 [k3_cppi_desc_pool] (P) [ 103.238408] am65_cpsw_nuss_free_rx_chns+0x28/0x4c [ti_am65_cpsw_nuss] [ 103.244942] devm_action_release+0x14/0x20 [ 103.249040] release_nodes+0x3c/0x68 [ 103.252610] devres_release_all+0x8c/0xdc [ 103.256614] device_unbind_cleanup+0x18/0x60 [ 103.260876] device_release_driver_internal+0xf8/0x178 [ 103.266004] driver_detach+0x50/0x9c [ 103.269571] bus_remove_driver+0x6c/0xbc [ 103.273485] driver_unregister+0x30/0x60 [ 103.277401] platform_driver_unregister+0x14/0x20 [ 103.282096] am65_cpsw_nuss_driver_exit+0x18/0xff4 [ti_am65_cpsw_nuss] [ 103.288620] __arm64_sys_delete_module+0x17c/0x25c [ 103.293404] invoke_syscall+0x44/0x100 [ 103.297149] el0_svc_common.constprop.0+0xc0/0xe0 [ 103.301845] do_el0_svc+0x1c/0x28 [ 103.305155] el0_svc+0x28/0x98 [ 103.308207] el0t_64_sync_handler+0xc8/0xcc [ 103.312384] el0t_64_sync+0x198/0x19c [ 103.316040] ---[ end trace 0000000000000000 ]--- Signed-off-by: Roger Quadros Acked-by: Jakub Kicinski Acked-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20250224-k3-udma-glue-single-fdq-v2-1-cbe7621f2507@kernel.org Signed-off-by: Vinod Koul --- include/linux/dma/k3-udma-glue.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma/k3-udma-glue.h 
b/include/linux/dma/k3-udma-glue.h index 2dea217629d0..5d43881e6fb7 100644 --- a/include/linux/dma/k3-udma-glue.h +++ b/include/linux/dma/k3-udma-glue.h @@ -138,8 +138,7 @@ int k3_udma_glue_rx_get_irq(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_num); void k3_udma_glue_reset_rx_chn(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_num, void *data, - void (*cleanup)(void *data, dma_addr_t desc_dma), - bool skip_fdq); + void (*cleanup)(void *data, dma_addr_t desc_dma)); int k3_udma_glue_rx_flow_enable(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_idx); int k3_udma_glue_rx_flow_disable(struct k3_udma_glue_rx_channel *rx_chn, -- cgit v1.2.3 From b2cd5ae693a3dc5b70a0f75fba96452c591a2047 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 4 Feb 2025 11:39:59 -0700 Subject: iomap: make buffered writes work with RWF_DONTCACHE Add iomap buffered write support for RWF_DONTCACHE. If RWF_DONTCACHE is set for a write, mark the folios being written as uncached. Then writeback completion will drop the pages. The write_iter handler simply kicks off writeback for the pages, and writeback completion will take care of the rest. Signed-off-by: "Darrick J. Wong" Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20250204184047.356762-2-axboe@kernel.dk Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/iomap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 75bf54e76f3b..26b0dbe23e62 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -183,6 +183,7 @@ struct iomap_folio_ops { #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ #define IOMAP_ATOMIC (1 << 9) +#define IOMAP_DONTCACHE (1 << 10) struct iomap_ops { /* -- cgit v1.2.3 From 291515c7640962f8865e4c54897a5e91526b450c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 25 Feb 2025 18:17:43 +0100 Subject: net: gro: decouple GRO from the NAPI layer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In fact, these two are not tied closely to each other. The only requirements to GRO are to use it in the BH context and have some sane limits on the packet batches, e.g. NAPI has a limit of its budget (64/8/etc.). Move purely GRO fields into a new structure, &gro_node. Embed it into &napi_struct and adjust all the references. gro_node::cached_napi_id is effectively the same as napi_struct::napi_id, but to be used on GRO hotpath to mark skbs. napi_struct::napi_id is now a fully control path field. Three Ethernet drivers use napi_gro_flush() not really meant to be exported, so move it to and add that include there. napi_gro_receive() is used in more than 100 drivers, keep it in . This does not make GRO ready to use outside of the NAPI context yet. 
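As a brief sketch of the resulting layering (names as in the diff below), existing drivers stay unchanged while the GRO engine itself now only sees a gro_node:

/* Driver code stays as-is: */
napi_gro_receive(napi, skb);

/* ...which is now a thin inline over the gro_node-based entry point: */
gro_receive_skb(&napi->gro, skb);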
Tested-by: Daniel Xu Acked-by: Jakub Kicinski Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2094d3edda73..26a0c4e4d963 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -340,11 +340,27 @@ struct gro_list { }; /* - * size of gro hash buckets, must less than bit number of - * napi_struct::gro_bitmask + * size of gro hash buckets, must be <= the number of bits in + * gro_node::bitmask */ #define GRO_HASH_BUCKETS 8 +/** + * struct gro_node - structure to support Generic Receive Offload + * @bitmask: bitmask to indicate used buckets in @hash + * @hash: hashtable of pending aggregated skbs, separated by flows + * @rx_list: list of pending ``GRO_NORMAL`` skbs + * @rx_count: cached current length of @rx_list + * @cached_napi_id: napi_struct::napi_id cached for hotpath, 0 for standalone + */ +struct gro_node { + unsigned long bitmask; + struct gro_list hash[GRO_HASH_BUCKETS]; + struct list_head rx_list; + u32 rx_count; + u32 cached_napi_id; +}; + /* * Structure for per-NAPI config */ @@ -371,7 +387,6 @@ struct napi_struct { unsigned long state; int weight; u32 defer_hard_irqs_count; - unsigned long gro_bitmask; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL /* CPU actively polling if netpoll is configured */ @@ -380,11 +395,8 @@ struct napi_struct { /* CPU on which NAPI has been scheduled for processing */ int list_owner; struct net_device *dev; - struct gro_list gro_hash[GRO_HASH_BUCKETS]; struct sk_buff *skb; - struct list_head rx_list; /* Pending GRO_NORMAL skbs */ - int rx_count; /* length of rx_list */ - unsigned int napi_id; /* protected by netdev_lock */ + struct gro_node gro; struct hrtimer timer; /* all fields past this point are write-protected by netdev_lock */ struct task_struct *thread; @@ -392,6 +404,7 @@ struct napi_struct { unsigned long irq_suspend_timeout; u32 defer_hard_irqs; /* control-path-only fields follow */ + u32 napi_id; struct list_head dev_list; struct hlist_node napi_hash_node; int irq; @@ -4131,8 +4144,14 @@ int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); void netif_receive_skb_list_internal(struct list_head *head); void netif_receive_skb_list(struct list_head *head); -gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); -void napi_gro_flush(struct napi_struct *napi, bool flush_old); +gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb); + +static inline gro_result_t napi_gro_receive(struct napi_struct *napi, + struct sk_buff *skb) +{ + return gro_receive_skb(&napi->gro, skb); +} + struct sk_buff *napi_get_frags(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); -- cgit v1.2.3 From 859d6acd94cc4ad65e9eb3fa2a9815a19e5b35cf Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 25 Feb 2025 18:17:47 +0100 Subject: net: skbuff: introduce napi_skb_cache_get_bulk() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a function to get an array of skbs from the NAPI percpu cache. It's supposed to be a drop-in replacement for kmem_cache_alloc_bulk(skbuff_head_cache, GFP_ATOMIC) and xdp_alloc_skb_bulk(GFP_ATOMIC). 
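A hedged usage sketch (hypothetical caller; BH context is assumed, as required):

void *skbs[16];
u32 got;

/* Drop-in for kmem_cache_alloc_bulk(); may return fewer than requested. */
got = napi_skb_cache_get_bulk(skbs, ARRAY_SIZE(skbs));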
The difference (apart from the requirement to call it only from the BH) is that it tries to use as many NAPI cache entries for skbs as possible, and allocates new ones only if needed. The logic is as follows: * there are enough skbs in the cache: decache them and return to the caller; * not enough: try refilling the cache first. If there are now enough skbs, return; * still not enough: try allocating skbs directly to the output array with %GFP_ZERO, maybe we'll be able to get some. If there's now enough, return; * still not enough: return as many as we were able to obtain. Most of the time, if called from the NAPI polling loop, the first one will be true, sometimes (rarely) the second one. The third and the fourth -- only under heavy memory pressure. It can save significant amounts of CPU cycles if there are GRO cycles and/or Tx completion cycles (anything that descends to napi_skb_cache_put()) happening on this CPU. Tested-by: Daniel Xu Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 171aa15f6541..14517e95a46c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1320,6 +1320,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size); void skb_attempt_defer_free(struct sk_buff *skb); +u32 napi_skb_cache_get_bulk(void **skbs, u32 n); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); struct sk_buff *slab_build_skb(void *data); -- cgit v1.2.3 From fd80df352ba1884ce2b62dd8d9495582308101b7 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 17 Feb 2025 14:01:56 +0000 Subject: regcache: Add support for sorting defaults arrays The defaults array in regcache must be sorted into ascending register address order, because binary search is used to locate values in the array. Add a helper to sort the register defaults array, which can be useful for systems that dynamically create a defaults array based on external information.
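A hedged usage sketch (hypothetical device table; registers and values are made up):

static struct reg_default example_defaults[] = {
	{ .reg = 0x20, .def = 0x01 },
	{ .reg = 0x04, .def = 0xff },
	{ .reg = 0x10, .def = 0x00 },
};

/* Sort once before handing the table to regmap as reg_defaults. */
regcache_sort_defaults(example_defaults, ARRAY_SIZE(example_defaults));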
Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20250217140159.2288784-2-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 3a96d068915f..d17c5ea3d55d 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1352,6 +1352,7 @@ bool regmap_can_raw_write(struct regmap *map); size_t regmap_get_raw_read_max(struct regmap *map); size_t regmap_get_raw_write_max(struct regmap *map); +void regcache_sort_defaults(struct reg_default *defaults, unsigned int ndefaults); int regcache_sync(struct regmap *map); int regcache_sync_region(struct regmap *map, unsigned int min, unsigned int max); @@ -2043,6 +2044,12 @@ static inline bool regmap_might_sleep(struct regmap *map) return true; } +static inline void regcache_sort_defaults(struct reg_default *defaults, + unsigned int ndefaults) +{ + WARN_ONCE(1, "regmap API is disabled"); +} + static inline int regcache_sync(struct regmap *map) { WARN_ONCE(1, "regmap API is disabled"); -- cgit v1.2.3 From 69d483d5f43e7a525246090c80f978b827104ad4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2025 13:31:09 -0800 Subject: io_uring/nvme: pass issue_flags to io_uring_cmd_import_fixed() io_uring_cmd_import_fixed() will need to know the io_uring execution state in following commits, for now just pass issue_flags into it without actually using. Reviewed-by: Keith Busch Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250224213116.3509093-5-kbusch@meta.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index abd0c8bd950b..87150dc0a07c 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -39,7 +39,8 @@ static inline void io_uring_cmd_private_sz_check(size_t cmd_sz) #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd); + struct iov_iter *iter, void *ioucmd, + unsigned int issue_flags); /* * Completes the request, i.e. posts an io_uring CQE and deallocates @ioucmd @@ -67,7 +68,8 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd) + struct iov_iter *iter, void *ioucmd, + unsigned int issue_flags) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 0aaddfb06882504dded9cde57f91035ab9403b82 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 21 Feb 2025 18:44:22 -0800 Subject: locking/local_lock: Introduce localtry_lock_t In !PREEMPT_RT local_lock_irqsave() disables interrupts to protect critical section, but it doesn't prevent NMI, so the fully reentrant code cannot use local_lock_irqsave() for exclusive access. Introduce localtry_lock_t and localtry_lock_irqsave() that disables interrupts and sets acquired=1, so localtry_lock_irqsave() from NMI attempting to acquire the same lock will return false. In PREEMPT_RT local_lock_irqsave() maps to preemptible spin_lock(). Map localtry_lock_irqsave() to preemptible spin_trylock(). When in hard IRQ or NMI return false right away, since spin_trylock() is not safe due to explicit locking in the underneath rt_spin_trylock() implementation. 
Removing this explicit locking and attempting only "trylock" is undesired due to PI implications. Note there is no need to use local_inc for the acquired variable, since it's a percpu variable with strict nesting scopes. Acked-by: Davidlohr Bueso Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Vlastimil Babka Link: https://lore.kernel.org/r/20250222024427.30294-2-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/local_lock.h | 70 +++++++++++++++++ include/linux/local_lock_internal.h | 146 ++++++++++++++++++++++++++++++++++++ 2 files changed, 216 insertions(+) (limited to 'include/linux') diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index 091dc0b6bdfb..1a0bc35839e3 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -51,6 +51,76 @@ #define local_unlock_irqrestore(lock, flags) \ __local_unlock_irqrestore(lock, flags) +/** + * localtry_lock_init - Runtime initialize a lock instance + */ +#define localtry_lock_init(lock) __localtry_lock_init(lock) + +/** + * localtry_lock - Acquire a per CPU local lock + * @lock: The lock variable + */ +#define localtry_lock(lock) __localtry_lock(lock) + +/** + * localtry_lock_irq - Acquire a per CPU local lock and disable interrupts + * @lock: The lock variable + */ +#define localtry_lock_irq(lock) __localtry_lock_irq(lock) + +/** + * localtry_lock_irqsave - Acquire a per CPU local lock, save and disable + * interrupts + * @lock: The lock variable + * @flags: Storage for interrupt flags + */ +#define localtry_lock_irqsave(lock, flags) \ + __localtry_lock_irqsave(lock, flags) + +/** + * localtry_trylock - Try to acquire a per CPU local lock. + * @lock: The lock variable + * + * The function can be used in any context such as NMI or HARDIRQ. Due to + * locking constraints it will _always_ fail to acquire the lock in NMI or + * HARDIRQ context on PREEMPT_RT. + */ +#define localtry_trylock(lock) __localtry_trylock(lock) + +/** + * localtry_trylock_irqsave - Try to acquire a per CPU local lock, save and disable + * interrupts if acquired + * @lock: The lock variable + * @flags: Storage for interrupt flags + * + * The function can be used in any context such as NMI or HARDIRQ. Due to + * locking constraints it will _always_ fail to acquire the lock in NMI or + * HARDIRQ context on PREEMPT_RT.
+ */ +#define localtry_trylock_irqsave(lock, flags) \ + __localtry_trylock_irqsave(lock, flags) + +/** + * local_unlock - Release a per CPU local lock + * @lock: The lock variable + */ +#define localtry_unlock(lock) __localtry_unlock(lock) + +/** + * local_unlock_irq - Release a per CPU local lock and enable interrupts + * @lock: The lock variable + */ +#define localtry_unlock_irq(lock) __localtry_unlock_irq(lock) + +/** + * localtry_unlock_irqrestore - Release a per CPU local lock and restore + * interrupt flags + * @lock: The lock variable + * @flags: Interrupt flags to restore + */ +#define localtry_unlock_irqrestore(lock, flags) \ + __localtry_unlock_irqrestore(lock, flags) + DEFINE_GUARD(local_lock, local_lock_t __percpu*, local_lock(_T), local_unlock(_T)) diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index 8dd71fbbb6d2..67bd13d142fa 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -15,6 +15,11 @@ typedef struct { #endif } local_lock_t; +typedef struct { + local_lock_t llock; + unsigned int acquired; +} localtry_lock_t; + #ifdef CONFIG_DEBUG_LOCK_ALLOC # define LOCAL_LOCK_DEBUG_INIT(lockname) \ .dep_map = { \ @@ -31,6 +36,13 @@ static inline void local_lock_acquire(local_lock_t *l) l->owner = current; } +static inline void local_trylock_acquire(local_lock_t *l) +{ + lock_map_acquire_try(&l->dep_map); + DEBUG_LOCKS_WARN_ON(l->owner); + l->owner = current; +} + static inline void local_lock_release(local_lock_t *l) { DEBUG_LOCKS_WARN_ON(l->owner != current); @@ -45,11 +57,13 @@ static inline void local_lock_debug_init(local_lock_t *l) #else /* CONFIG_DEBUG_LOCK_ALLOC */ # define LOCAL_LOCK_DEBUG_INIT(lockname) static inline void local_lock_acquire(local_lock_t *l) { } +static inline void local_trylock_acquire(local_lock_t *l) { } static inline void local_lock_release(local_lock_t *l) { } static inline void local_lock_debug_init(local_lock_t *l) { } #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ #define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) } +#define INIT_LOCALTRY_LOCK(lockname) { .llock = { LOCAL_LOCK_DEBUG_INIT(lockname.llock) }} #define __local_lock_init(lock) \ do { \ @@ -118,6 +132,104 @@ do { \ #define __local_unlock_nested_bh(lock) \ local_lock_release(this_cpu_ptr(lock)) +/* localtry_lock_t variants */ + +#define __localtry_lock_init(lock) \ +do { \ + __local_lock_init(&(lock)->llock); \ + WRITE_ONCE((lock)->acquired, 0); \ +} while (0) + +#define __localtry_lock(lock) \ + do { \ + localtry_lock_t *lt; \ + preempt_disable(); \ + lt = this_cpu_ptr(lock); \ + local_lock_acquire(<->llock); \ + WRITE_ONCE(lt->acquired, 1); \ + } while (0) + +#define __localtry_lock_irq(lock) \ + do { \ + localtry_lock_t *lt; \ + local_irq_disable(); \ + lt = this_cpu_ptr(lock); \ + local_lock_acquire(<->llock); \ + WRITE_ONCE(lt->acquired, 1); \ + } while (0) + +#define __localtry_lock_irqsave(lock, flags) \ + do { \ + localtry_lock_t *lt; \ + local_irq_save(flags); \ + lt = this_cpu_ptr(lock); \ + local_lock_acquire(<->llock); \ + WRITE_ONCE(lt->acquired, 1); \ + } while (0) + +#define __localtry_trylock(lock) \ + ({ \ + localtry_lock_t *lt; \ + bool _ret; \ + \ + preempt_disable(); \ + lt = this_cpu_ptr(lock); \ + if (!READ_ONCE(lt->acquired)) { \ + WRITE_ONCE(lt->acquired, 1); \ + local_trylock_acquire(<->llock); \ + _ret = true; \ + } else { \ + _ret = false; \ + preempt_enable(); \ + } \ + _ret; \ + }) + +#define __localtry_trylock_irqsave(lock, flags) \ + ({ \ + localtry_lock_t *lt; \ + bool 
_ret; \ + \ + local_irq_save(flags); \ + lt = this_cpu_ptr(lock); \ + if (!READ_ONCE(lt->acquired)) { \ + WRITE_ONCE(lt->acquired, 1); \ + local_trylock_acquire(<->llock); \ + _ret = true; \ + } else { \ + _ret = false; \ + local_irq_restore(flags); \ + } \ + _ret; \ + }) + +#define __localtry_unlock(lock) \ + do { \ + localtry_lock_t *lt; \ + lt = this_cpu_ptr(lock); \ + WRITE_ONCE(lt->acquired, 0); \ + local_lock_release(<->llock); \ + preempt_enable(); \ + } while (0) + +#define __localtry_unlock_irq(lock) \ + do { \ + localtry_lock_t *lt; \ + lt = this_cpu_ptr(lock); \ + WRITE_ONCE(lt->acquired, 0); \ + local_lock_release(<->llock); \ + local_irq_enable(); \ + } while (0) + +#define __localtry_unlock_irqrestore(lock, flags) \ + do { \ + localtry_lock_t *lt; \ + lt = this_cpu_ptr(lock); \ + WRITE_ONCE(lt->acquired, 0); \ + local_lock_release(<->llock); \ + local_irq_restore(flags); \ + } while (0) + #else /* !CONFIG_PREEMPT_RT */ /* @@ -125,8 +237,10 @@ do { \ * critical section while staying preemptible. */ typedef spinlock_t local_lock_t; +typedef spinlock_t localtry_lock_t; #define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) +#define INIT_LOCALTRY_LOCK(lockname) INIT_LOCAL_LOCK(lockname) #define __local_lock_init(l) \ do { \ @@ -169,4 +283,36 @@ do { \ spin_unlock(this_cpu_ptr((lock))); \ } while (0) +/* localtry_lock_t variants */ + +#define __localtry_lock_init(lock) __local_lock_init(lock) +#define __localtry_lock(lock) __local_lock(lock) +#define __localtry_lock_irq(lock) __local_lock(lock) +#define __localtry_lock_irqsave(lock, flags) __local_lock_irqsave(lock, flags) +#define __localtry_unlock(lock) __local_unlock(lock) +#define __localtry_unlock_irq(lock) __local_unlock(lock) +#define __localtry_unlock_irqrestore(lock, flags) __local_unlock_irqrestore(lock, flags) + +#define __localtry_trylock(lock) \ + ({ \ + int __locked; \ + \ + if (in_nmi() | in_hardirq()) { \ + __locked = 0; \ + } else { \ + migrate_disable(); \ + __locked = spin_trylock(this_cpu_ptr((lock))); \ + if (!__locked) \ + migrate_enable(); \ + } \ + __locked; \ + }) + +#define __localtry_trylock_irqsave(lock, flags) \ + ({ \ + typecheck(unsigned long, flags); \ + flags = 0; \ + __localtry_trylock(lock); \ + }) + #endif /* CONFIG_PREEMPT_RT */ -- cgit v1.2.3 From 97769a53f117e2f33864c587d85992ee35194ecf Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 21 Feb 2025 18:44:23 -0800 Subject: mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation Tracing BPF programs execute from tracepoints and kprobes where running context is unknown, but they need to request additional memory. The prior workarounds were using pre-allocated memory and BPF specific freelists to satisfy such allocation requests. Instead, introduce gfpflags_allow_spinning() condition that signals to the allocator that running context is unknown. Then rely on percpu free list of pages to allocate a page. try_alloc_pages() -> get_page_from_freelist() -> rmqueue() -> rmqueue_pcplist() will spin_trylock to grab the page from percpu free list. If it fails (due to re-entrancy or list being empty) then rmqueue_bulk()/rmqueue_buddy() will attempt to spin_trylock zone->lock and grab the page from there. spin_trylock() is not safe in PREEMPT_RT when in NMI or in hard IRQ. Bailout early in such case. The support for gfpflags_allow_spinning() mode for free_page and memcg comes in the next patches. This is a first step towards supporting BPF requirements in SLUB and getting rid of bpf_mem_alloc. 
That goal was discussed at LSFMM: https://lwn.net/Articles/974138/ Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Sebastian Andrzej Siewior Reviewed-by: Shakeel Butt Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20250222024427.30294-3-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/gfp.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 6bb1a5a7a4ae..5d9ee78c74e4 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -39,6 +39,25 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) return !!(gfp_flags & __GFP_DIRECT_RECLAIM); } +static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags) +{ + /* + * !__GFP_DIRECT_RECLAIM -> direct claim is not allowed. + * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd. + * All GFP_* flags including GFP_NOWAIT use one or both flags. + * try_alloc_pages() is the only API that doesn't specify either flag. + * + * This is stronger than GFP_NOWAIT or GFP_ATOMIC because + * those are guaranteed to never block on a sleeping lock. + * Here we are enforcing that the allocation doesn't ever spin + * on any locks (i.e. only trylocks). There is no high level + * GFP_$FOO flag for this use in try_alloc_pages() as the + * regular page allocator doesn't fully support this + * allocation mode. + */ + return !(gfp_flags & __GFP_RECLAIM); +} + #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else @@ -335,6 +354,9 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp, } #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) +struct page *try_alloc_pages_noprof(int nid, unsigned int order); +#define try_alloc_pages(...) alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__)) + extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); #define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__)) -- cgit v1.2.3 From 8c57b687e8331eb80e302a2c528b18b966a9ac7a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 21 Feb 2025 18:44:24 -0800 Subject: mm, bpf: Introduce free_pages_nolock() Introduce free_pages_nolock() that can free pages without taking locks. It relies on trylock and can be called from any context. Since spin_trylock() cannot be used in PREEMPT_RT from hard IRQ or NMI it uses lockless link list to stash the pages which will be freed by subsequent free_pages() from good context. Do not use llist unconditionally. BPF maps continuously allocate/free, so we cannot unconditionally delay the freeing to llist. When the memory becomes free make it available to the kernel and BPF users right away if possible, and fallback to llist as the last resort. 
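As a minimal illustrative sketch (not part of the patches themselves), a caller in an unknown context might pair the two APIs like this; only try_alloc_pages(), free_pages_nolock() and gfpflags_allow_spinning() come from this series, the helper names are hypothetical:

#include <linux/gfp.h>

/* Opportunistically grab one page; may return NULL under contention. */
static struct page *grab_scratch_page(int nid)
{
	return try_alloc_pages(nid, 0);	/* only ever trylocks internally */
}

/* Safe from any context; falls back to the lockless llist if needed. */
static void drop_scratch_page(struct page *page)
{
	if (page)
		free_pages_nolock(page, 0);
}

Unlike GFP_ATOMIC/GFP_NOWAIT callers, such a caller must always tolerate a NULL return, since no spinning on zone or pcp locks is ever performed.
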
Acked-by: Vlastimil Babka Acked-by: Sebastian Andrzej Siewior Reviewed-by: Shakeel Butt Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20250222024427.30294-4-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/gfp.h | 1 + include/linux/mm_types.h | 4 ++++ include/linux/mmzone.h | 3 +++ 3 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5d9ee78c74e4..ceb226c2e25c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -379,6 +379,7 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas __get_free_pages((gfp_mask) | GFP_DMA, (order)) extern void __free_pages(struct page *page, unsigned int order); +extern void free_pages_nolock(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); #define __free_page(page) __free_pages((page), 0) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6b27db7f9496..483aa90242cd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -99,6 +99,10 @@ struct page { /* Or, free page */ struct list_head buddy_list; struct list_head pcp_list; + struct { + struct llist_node pcp_llist; + unsigned int order; + }; }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9540b41894da..e16939553930 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -972,6 +972,9 @@ struct zone { /* Primarily protects free_area */ spinlock_t lock; + /* Pages to be freed when next trylock succeeds */ + struct llist_head trylock_free_pages; + /* Write-intensive fields used by compaction and vmstats. */ CACHELINE_PADDING(_pad2_); -- cgit v1.2.3 From c9eb8102e21e8c4c6fb74d1921d99e4d43713520 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 21 Feb 2025 18:44:27 -0800 Subject: bpf: Use try_alloc_pages() to allocate pages for bpf needs. Use try_alloc_pages() and free_pages_nolock() for BPF needs when context doesn't allow using normal alloc_pages. This is a prerequisite for further work. Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20250222024427.30294-7-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f3f50e29d639..e1838a341817 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2348,7 +2348,7 @@ int generic_map_delete_batch(struct bpf_map *map, struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, +int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **page_array); #ifdef CONFIG_MEMCG void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, -- cgit v1.2.3 From 02410ac72ac3707936c07ede66e94360d0d65319 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 26 Feb 2025 12:06:51 +0000 Subject: mm: hugetlb: Add huge page size param to huge_ptep_get_and_clear() In order to fix a bug, arm64 needs to be told the size of the huge page for which the huge_pte is being cleared in huge_ptep_get_and_clear(). Provide for this by adding an `unsigned long sz` parameter to the function. This follows the same pattern as huge_pte_clear() and set_huge_pte_at(). 
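As a sketch of the resulting call shape (the wrapper is hypothetical; huge_page_size(), hstate_vma() and the extended huge_ptep_get_and_clear() are the real interfaces, mirroring the hugetlb.h hunk below):

#include <linux/hugetlb.h>

static inline pte_t examplefs_clear_huge_pte(struct vm_area_struct *vma,
					     unsigned long addr, pte_t *ptep)
{
	unsigned long psize = huge_page_size(hstate_vma(vma));

	/* The new 'sz' argument tells arm64 which contiguous size is in use. */
	return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, psize);
}
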
This commit makes the required interface modifications to the core mm as well as all arches that implement this function (arm64, loongarch, mips, parisc, powerpc, riscv, s390, sparc). The actual arm64 bug will be fixed in a separate commit. Cc: stable@vger.kernel.org Fixes: 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit") Acked-by: David Hildenbrand Reviewed-by: Alexandre Ghiti # riscv Reviewed-by: Christophe Leroy Reviewed-by: Catalin Marinas Reviewed-by: Anshuman Khandual Signed-off-by: Ryan Roberts Acked-by: Alexander Gordeev # s390 Link: https://lore.kernel.org/r/20250226120656.2400136-2-ryan.roberts@arm.com Signed-off-by: Will Deacon --- include/linux/hugetlb.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ec8c0ccc8f95..bf5f7256bd28 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1004,7 +1004,9 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + unsigned long psize = huge_page_size(hstate_vma(vma)); + + return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, psize); } #endif -- cgit v1.2.3 From 62fb8adc43afad5fa1c9cadc6f3a8e9fb72af194 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 18 Feb 2025 15:22:05 -0700 Subject: mm: Provide address mask in struct follow_pfnmap_args follow_pfnmap_start() walks the page table for a given address and fills out the struct follow_pfnmap_args in pfnmap_args_setup(). The address mask of the page table level is already provided to this latter function for calculating the pfn. This address mask can also be useful for the caller to determine the extent of the contiguous mapping. For example, vfio-pci now supports huge_fault for pfnmaps and is able to insert pud and pmd mappings. When we DMA map these pfnmaps, ex. PCI MMIO BARs, we iterate follow_pfnmap_start() to get each pfn to test for a contiguous pfn range. Providing the mapping address mask allows us to skip the extent of the mapping level. Assuming a 1GB pud level and 4KB page size, iterations are reduced by a factor of 256K. In wall clock time, mapping a 32GB PCI BAR is reduced from ~1s to <1ms. 
Cc: Andrew Morton Cc: David Hildenbrand Cc: linux-mm@kvack.org Reviewed-by: Peter Xu Reviewed-by: Mitchell Augustin Tested-by: Mitchell Augustin Reviewed-by: Jason Gunthorpe Acked-by: David Hildenbrand Link: https://lore.kernel.org/r/20250218222209.1382449-6-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b1068ddcbb7..92b30dba7e38 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2417,11 +2417,13 @@ struct follow_pfnmap_args { * Outputs: * * @pfn: the PFN of the address + * @addr_mask: address mask covering pfn * @pgprot: the pgprot_t of the mapping * @writable: whether the mapping is writable * @special: whether the mapping is a special mapping (real PFN maps) */ unsigned long pfn; + unsigned long addr_mask; pgprot_t pgprot; bool writable; bool special; -- cgit v1.2.3 From 88d5baf69082e5b410296435008329676b687549 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 27 Feb 2025 12:32:53 +1100 Subject: Change inode_operations.mkdir to return struct dentry * Some filesystems, such as NFS, cifs, ceph, and fuse, do not have complete control of sequencing on the actual filesystem (e.g. on a different server) and may find that the inode created for a mkdir request already exists in the icache and dcache by the time the mkdir request returns. For example, if the filesystem is mounted twice the directory could be visible on the other mount before it is on the original mount, and a pair of name_to_handle_at(), open_by_handle_at() calls could instantiate the directory inode with an IS_ROOT() dentry before the first mkdir returns. This means that the dentry passed to ->mkdir() may not be the one that is associated with the inode after the ->mkdir() completes. Some callers need to interact with the inode after the ->mkdir completes and they currently need to perform a lookup in the (rare) case that the dentry is no longer hashed. This lookup-after-mkdir requires that the directory remains locked to avoid races. Planned future patches to lock the dentry rather than the directory will mean that this lookup cannot be performed atomically with the mkdir. To remove this barrier, this patch changes ->mkdir to return the resulting dentry if it is different from the one passed in. Possible returns are:

NULL - the directory was created and no other dentry was used
ERR_PTR() - an error occurred
non-NULL - this other dentry was spliced in

This patch only changes file-systems to return "ERR_PTR(err)" instead of "err" or equivalent transformations. Subsequent patches will make further changes to some file-systems to return a correct dentry. Not all filesystems reliably result in a positive hashed dentry:

- NFS, cifs, hostfs will sometimes need to perform a lookup of the name to get inode information. Races could result in this returning something different. Note that this lookup is non-atomic which is what we are trying to avoid. Placing the lookup in filesystem code means it only happens when the filesystem has no other option.

- kernfs and tracefs leave the dentry negative and the ->revalidate operation ensures that lookup will be called to correctly populate the dentry. This could be fixed but I don't think it is important to any of the users of vfs_mkdir() which look at the dentry.

The recommendation to use d_drop();d_splice_alias() is ugly but fits with current practice. A planned future patch will change this. 
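As an illustrative sketch of the new contract (examplefs and its helper are hypothetical):

#include <linux/fs.h>

static struct dentry *examplefs_mkdir(struct mnt_idmap *idmap,
				      struct inode *dir,
				      struct dentry *dentry, umode_t mode)
{
	int err = examplefs_create_dir(dir, dentry, mode); /* hypothetical */

	if (err)
		return ERR_PTR(err);
	return NULL;	/* the dentry passed in was used unchanged */
}

A filesystem that may splice in a different dentry would instead return the result of d_splice_alias() here rather than NULL.
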
Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/20250227013949.536172-2-neilb@suse.de Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2c3b2f8a621f..45269608ee23 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2211,8 +2211,8 @@ struct inode_operations { int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *, const char *); - int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *, - umode_t); + struct dentry *(*mkdir) (struct mnt_idmap *, struct inode *, + struct dentry *, umode_t); int (*rmdir) (struct inode *,struct dentry *); int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); -- cgit v1.2.3 From 7112c05fff83e15726dd60a10248b76474e3cdf9 Mon Sep 17 00:00:00 2001 From: Xianwei Zhao Date: Wed, 12 Feb 2025 13:20:51 +0800 Subject: pinctrl: pinconf-generic: Add API for pinmux property in DTS file When describing a pin mux function through the pinmux property, a standard API is added to support it. The pinmux property contains pin identification and mux values, and can cover multiple pins. Group configuration is expressed with other properties. DTS such as:

func-name {
	group_alias: group-name {
		pinmux= , ;
		bias-pull-up;
		drive-strength-microamp = <4000>;
	};
};

Signed-off-by: Xianwei Zhao Link: https://lore.kernel.org/20250212-amlogic-pinctrl-v5-2-282bc2516804@amlogic.com Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf-generic.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 53cfde98433d..1bcf071b860e 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -232,4 +232,8 @@ static inline int pinconf_generic_dt_node_to_map_all(struct pinctrl_dev *pctldev PIN_MAP_TYPE_INVALID); } +int pinconf_generic_dt_node_to_map_pinmux(struct pinctrl_dev *pctldev, + struct device_node *np, + struct pinctrl_map **map, + unsigned int *num_maps); #endif /* __LINUX_PINCTRL_PINCONF_GENERIC_H */ -- cgit v1.2.3 From f8131f4cc5bda6154d81ee5bafe3db7e2e72a89c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 25 Feb 2025 21:09:23 +0100 Subject: net: qed: make 'qed_ll2_ops_pass' as __maybe_unused gcc warns about unused const variables even in header files when building with W=1: In file included from include/linux/qed/qed_rdma_if.h:14, from drivers/net/ethernet/qlogic/qed/qed_rdma.h:16, from drivers/net/ethernet/qlogic/qed/qed_cxt.c:23: include/linux/qed/qed_ll2_if.h:270:33: error: 'qed_ll2_ops_pass' defined but not used [-Werror=unused-const-variable=] 270 | static const struct qed_ll2_ops qed_ll2_ops_pass = { This one is intentional, so mark it as __maybe_unused so it can be included from a file that doesn't use this variable. 
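For illustration, a minimal reproduction of the pattern in a header (the ops structure here is hypothetical):

struct example_ops {
	int (*start)(void);
	void (*stop)(void);
};

/*
 * Without __maybe_unused, -Wunused-const-variable fires in every
 * translation unit that includes this header but never references
 * the stub table.
 */
static __maybe_unused const struct example_ops example_ops_stub = {
	.start = NULL,
	.stop = NULL,
};
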
Signed-off-by: Arnd Bergmann Reviewed-by: Simon Horman Tested-by: Simon Horman # build-tested Link: https://patch.msgid.link/20250225200926.4057723-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/qed/qed_ll2_if.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index 5b67cd03276e..aa29ac53b833 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -267,7 +267,7 @@ struct qed_ll2_ops { int qed_ll2_alloc_if(struct qed_dev *); void qed_ll2_dealloc_if(struct qed_dev *); #else -static const struct qed_ll2_ops qed_ll2_ops_pass = { +static __maybe_unused const struct qed_ll2_ops qed_ll2_ops_pass = { .start = NULL, .stop = NULL, .start_xmit = NULL, -- cgit v1.2.3 From 9dcef93363e7f7b925b3adc4a3171bd00250c8dd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 21 Feb 2025 20:44:19 +0000 Subject: fs: Remove page_mkwrite_check_truncate() All callers of this function have now been converted to use folio_mkwrite_check_truncate(). Signed-off-by: "Matthew Wilcox (Oracle)" Link: https://lore.kernel.org/r/20250221204421.3590340-1-willy@infradead.org Tested-by: Viacheslav Dubeyko Reviewed-by: Viacheslav Dubeyko Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 47bfc6b1b632..7fe82d43cf39 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1602,34 +1602,6 @@ static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio, return offset; } -/** - * page_mkwrite_check_truncate - check if page was truncated - * @page: the page to check - * @inode: the inode to check the page against - * - * Returns the number of bytes in the page up to EOF, - * or -EFAULT if the page was truncated. - */ -static inline int page_mkwrite_check_truncate(struct page *page, - struct inode *inode) -{ - loff_t size = i_size_read(inode); - pgoff_t index = size >> PAGE_SHIFT; - int offset = offset_in_page(size); - - if (page->mapping != inode->i_mapping) - return -EFAULT; - - /* page is wholly inside EOF */ - if (page->index < index) - return PAGE_SIZE; - /* page is wholly past EOF */ - if (page->index > index || !offset) - return -EFAULT; - /* page is partially inside EOF */ - return offset; -} - /** * i_blocks_per_folio - How many blocks fit in this folio. * @inode: The inode which contains the blocks. -- cgit v1.2.3 From 27cb27b6d5ea401143ca3648983342bb820c4be9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 14:39:14 -0800 Subject: io_uring: add support for kernel registered bvecs Provide an interface for the kernel to leverage the existing pre-registered buffers that io_uring provides. User space can reference these later to achieve zero-copy IO. User space must register an empty fixed buffer table with io_uring in order for the kernel to make use of it. 
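A rough sketch of how a block driver's ->uring_cmd() handler might use the new interface (everything except io_buffer_register_bvec() is a hypothetical name):

#include <linux/io_uring/cmd.h>
#include <linux/blk-mq.h>

static void example_buf_release(void *priv)
{
	/* drop whatever reference pinned the request's pages */
}

static int example_register_rq(struct io_uring_cmd *cmd, struct request *rq,
			       unsigned int index, unsigned int issue_flags)
{
	/* Expose the request's bvecs at @index of the fixed buffer table. */
	return io_buffer_register_bvec(cmd, rq, example_buf_release,
				       index, issue_flags);
}

Userspace can then reference slot @index with IORING_OP_READ_FIXED or IORING_OP_WRITE_FIXED to achieve zero-copy IO against the kernel-owned pages.
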
Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20250227223916.143006-5-kbusch@meta.com Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 87150dc0a07c..cf8d80d84734 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -4,6 +4,7 @@ #include #include +#include /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ #define IORING_URING_CMD_CANCELABLE (1U << 30) @@ -125,4 +126,10 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur return cmd_to_io_kiocb(cmd)->async_data; } +int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, + void (*release)(void *), unsigned int index, + unsigned int issue_flags); +void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags); + #endif /* _LINUX_IO_URING_CMD_H */ -- cgit v1.2.3 From ed9f3112a8a8f6e6919d3b9da2651fa302df7be3 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 14:39:16 -0800 Subject: io_uring: cache nodes and mapped buffers Frequent alloc/free cycles on these is pretty costly. Use an io cache to more efficiently reuse these buffers. Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20250227223916.143006-7-kbusch@meta.com [axboe: fix imu leak] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 123e69368730..432c98ff52ee 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -292,6 +292,8 @@ struct io_ring_ctx { struct io_file_table file_table; struct io_rsrc_data buf_table; + struct io_alloc_cache node_cache; + struct io_alloc_cache imu_cache; struct io_submit_state submit_state; -- cgit v1.2.3 From dea5c8ec20be7603e4a56b9d6571321f330626c1 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 27 Feb 2025 09:16:23 +0000 Subject: net: stmmac: provide set_clk_tx_rate() hook Several stmmac sub-drivers which support RGMII follow the same pattern. They calculate the transmit clock rate, and then call clk_set_rate(). Analysis of several implementation documents suggests that the platform is responsible for providing the transmit clock to the DWMAC core's clk_tx_i. The expected rates are:

		10Mbps	100Mbps	1Gbps
	MII	2.5MHz	25MHz
	RMII	2.5MHz	25MHz
	GMII			125MHz
	RGMII	2.5MHz	25MHz	125MHz

It seems some platforms require this clock to be manually configured, but there are outputs from the MAC core that indicate the speed, so a platform may use these to automatically configure the clock. Thus, we can't just provide one solution to configure this clock rate. Moreover, the clock may need to be derived from one of several sources depending on the interface mode. Provide a platform hook that is passed the transmit clock, interface mode and speed. 
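As a sketch, a platform implementation matching the rate table above might look like this (the callback body is a hypothetical assumption; SPEED_* and clk_set_rate() are the usual kernel interfaces):

#include <linux/clk.h>
#include <linux/phy.h>

static int example_set_clk_tx_rate(void *bsp_priv, struct clk *clk_tx_i,
				   phy_interface_t interface, int speed)
{
	unsigned long rate;

	switch (speed) {
	case SPEED_1000:
		rate = 125000000;
		break;
	case SPEED_100:
		rate = 25000000;
		break;
	case SPEED_10:
		rate = 2500000;
		break;
	default:
		return -EINVAL;
	}

	return clk_set_rate(clk_tx_i, rate);
}

A platform would then assign plat->set_clk_tx_rate = example_set_clk_tx_rate and fill in plat->clk_tx_i; platforms whose hardware tracks the speed automatically can simply leave the hook unset.
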
Reviewed-by: Thierry Reding Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1tna0F-0052sS-Lr@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 6d2aa77ea963..cd0d1383df87 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -78,6 +78,7 @@ | DMA_AXI_BLEN_32 | DMA_AXI_BLEN_64 \ | DMA_AXI_BLEN_128 | DMA_AXI_BLEN_256) +struct clk; struct stmmac_priv; /* Platfrom data for platform device structure's platform_data field */ @@ -231,6 +232,8 @@ struct plat_stmmacenet_data { u8 tx_sched_algorithm; struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES]; struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES]; + int (*set_clk_tx_rate)(void *priv, struct clk *clk_tx_i, + phy_interface_t interface, int speed); void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); int (*fix_soc_reset)(void *priv, void __iomem *ioaddr); int (*serdes_powerup)(struct net_device *ndev, void *priv); @@ -252,6 +255,7 @@ struct plat_stmmacenet_data { struct clk *stmmac_clk; struct clk *pclk; struct clk *clk_ptp_ref; + struct clk *clk_tx_i; /* clk_tx_i to MAC core */ unsigned long clk_ptp_rate; unsigned long clk_ref_rate; struct clk_bulk_data *clks; -- cgit v1.2.3 From 808aac63e2bdf9bae08485e072bf3d317a18acbf Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 28 Feb 2025 10:19:34 -0800 Subject: uaccess: Introduce ucopysize.h The object size sanity checking macros that uaccess.h and uio.h use have been living in thread_info.h for historical reasons. Needing to use jump labels for these checks, however, introduces a header include loop under certain conditions. The dependencies for the object checking macros are very limited, but they are used by separate header files, so introduce a new header that can be used directly by uaccess.h and uio.h. As a result, this also means thread_info.h (which is rather large) can be removed from those headers. 
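For illustration, the compile-time half of check_copy_size() in action (the structure and helper are hypothetical):

#include <linux/uaccess.h>

struct reply {
	char data[16];
};

static int copy_reply(struct reply *r, const void __user *src, size_t len)
{
	if (len > sizeof(r->data))
		return -EINVAL;
	/* copy_from_user() invokes check_copy_size() behind the scenes. */
	return copy_from_user(r->data, src, len) ? -EFAULT : 0;
}

If a constant len larger than 16 reached the copy, __bad_copy_to() would break the build; a variable len falls back to the runtime hardened-usercopy object check.
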
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502281153.TG2XK5SI-lkp@intel.com/ Signed-off-by: Kees Cook --- include/linux/thread_info.h | 48 -------------------------------------- include/linux/uaccess.h | 2 +- include/linux/ucopysize.h | 56 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/uio.h | 2 +- 4 files changed, 58 insertions(+), 50 deletions(-) create mode 100644 include/linux/ucopysize.h (limited to 'include/linux') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index cf2446c9c30d..dd925d84fa46 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -217,54 +217,6 @@ static inline int arch_within_stack_frames(const void * const stack, } #endif -#ifdef CONFIG_HARDENED_USERCOPY -extern void __check_object_size(const void *ptr, unsigned long n, - bool to_user); - -static __always_inline void check_object_size(const void *ptr, unsigned long n, - bool to_user) -{ - if (!__builtin_constant_p(n)) - __check_object_size(ptr, n, to_user); -} -#else -static inline void check_object_size(const void *ptr, unsigned long n, - bool to_user) -{ } -#endif /* CONFIG_HARDENED_USERCOPY */ - -extern void __compiletime_error("copy source size is too small") -__bad_copy_from(void); -extern void __compiletime_error("copy destination size is too small") -__bad_copy_to(void); - -void __copy_overflow(int size, unsigned long count); - -static inline void copy_overflow(int size, unsigned long count) -{ - if (IS_ENABLED(CONFIG_BUG)) - __copy_overflow(size, count); -} - -static __always_inline __must_check bool -check_copy_size(const void *addr, size_t bytes, bool is_source) -{ - int sz = __builtin_object_size(addr, 0); - if (unlikely(sz >= 0 && sz < bytes)) { - if (!__builtin_constant_p(bytes)) - copy_overflow(sz, bytes); - else if (is_source) - __bad_copy_from(); - else - __bad_copy_to(); - return false; - } - if (WARN_ON_ONCE(bytes > INT_MAX)) - return false; - check_object_size(addr, bytes, is_source); - return true; -} - #ifndef arch_setup_new_exec static inline void arch_setup_new_exec(void) { } #endif diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index e9c702c1908d..7c06f4795670 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include diff --git a/include/linux/ucopysize.h b/include/linux/ucopysize.h new file mode 100644 index 000000000000..b3e1b875d565 --- /dev/null +++ b/include/linux/ucopysize.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Perform sanity checking for object sizes for uaccess.h and uio.h. 
*/ +#ifndef __LINUX_UCOPYSIZE_H__ +#define __LINUX_UCOPYSIZE_H__ + +#include + +#ifdef CONFIG_HARDENED_USERCOPY +extern void __check_object_size(const void *ptr, unsigned long n, + bool to_user); + +static __always_inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ + if (!__builtin_constant_p(n)) + __check_object_size(ptr, n, to_user); +} +#else +static inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ } +#endif /* CONFIG_HARDENED_USERCOPY */ + +extern void __compiletime_error("copy source size is too small") +__bad_copy_from(void); +extern void __compiletime_error("copy destination size is too small") +__bad_copy_to(void); + +void __copy_overflow(int size, unsigned long count); + +static inline void copy_overflow(int size, unsigned long count) +{ + if (IS_ENABLED(CONFIG_BUG)) + __copy_overflow(size, count); +} + +static __always_inline __must_check bool +check_copy_size(const void *addr, size_t bytes, bool is_source) +{ + int sz = __builtin_object_size(addr, 0); + if (unlikely(sz >= 0 && sz < bytes)) { + if (!__builtin_constant_p(bytes)) + copy_overflow(sz, bytes); + else if (is_source) + __bad_copy_from(); + else + __bad_copy_to(); + return false; + } + if (WARN_ON_ONCE(bytes > INT_MAX)) + return false; + check_object_size(addr, bytes, is_source); + return true; +} + +#endif /* __LINUX_UCOPYSIZE_H__ */ diff --git a/include/linux/uio.h b/include/linux/uio.h index 8ada84e85447..49ece9e1888f 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -6,8 +6,8 @@ #define __LINUX_UIO_H #include -#include #include +#include #include struct page; -- cgit v1.2.3 From 496d2d23886436f7c651bf4c14950eb002815c61 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 23 Jan 2025 22:11:14 +0000 Subject: mm: security: Check early if HARDENED_USERCOPY is enabled HARDENED_USERCOPY is checked within a function so even if disabled, the function overhead still exists. Move the static check inline. This is at best a micro-optimisation and any difference in performance was within noise but it is relatively consistent with the init_on_* implementations. Suggested-by: Kees Cook Signed-off-by: Mel Gorman Link: https://lore.kernel.org/r/20250123221115.19722-4-mgorman@techsingularity.net Signed-off-by: Kees Cook --- include/linux/ucopysize.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ucopysize.h b/include/linux/ucopysize.h index b3e1b875d565..41c2d9720466 100644 --- a/include/linux/ucopysize.h +++ b/include/linux/ucopysize.h @@ -6,14 +6,21 @@ #include #ifdef CONFIG_HARDENED_USERCOPY +#include extern void __check_object_size(const void *ptr, unsigned long n, bool to_user); +DECLARE_STATIC_KEY_MAYBE(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, + validate_usercopy_range); + static __always_inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { - if (!__builtin_constant_p(n)) + if (!__builtin_constant_p(n) && + static_branch_maybe(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, + &validate_usercopy_range)) { __check_object_size(ptr, n, to_user); + } } #else static inline void check_object_size(const void *ptr, unsigned long n, -- cgit v1.2.3 From 3b62449da444502d8b87bf090d8ec4a57d47eda5 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 24 Feb 2025 15:13:51 +0100 Subject: driver core: Introduce device_{add,remove}_of_node() An of_node can be set to a device using device_set_node(), which does not prevent any of_node and/or fwnode overwrites. 
When adding an of_node on an already present device, the following operations need to be done:

- Attach the of_node only if no of_node is already attached
- Attach the of_node as a fwnode if no fwnode was already attached

This is the purpose of device_add_of_node(). device_remove_of_node() reverts the operations done by device_add_of_node(). Link: https://lore.kernel.org/r/20250224141356.36325-2-herve.codina@bootlin.com Signed-off-by: Herve Codina Signed-off-by: Bjorn Helgaas Reviewed-by: Rob Herring (Arm) Acked-by: Greg Kroah-Hartman --- include/linux/device.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 80a5b3268986..1244e5892292 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1191,6 +1191,8 @@ int device_online(struct device *dev); void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode); void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode); void device_set_node(struct device *dev, struct fwnode_handle *fwnode); +int device_add_of_node(struct device *dev, struct device_node *of_node); +void device_remove_of_node(struct device *dev); void device_set_of_node_from_dev(struct device *dev, const struct device *dev2); static inline struct device_node *dev_of_node(struct device *dev) -- cgit v1.2.3 From 4b31eb55dbc64d72ff57a1888f8e4d7996a693bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 17 Feb 2025 11:25:02 +0100 Subject: pwm: Check for CONFIG_PWM using IS_REACHABLE() in main header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for CONFIG_PWM becoming tristate, the right magic to check for the availability of the pwm functions is using IS_REACHABLE() and not IS_ENABLED(). The latter gives the wrong result for built-in code with CONFIG_PWM=m. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20250217102504.687916-2-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index a2df509056ac..9ece4e5d3815 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -379,7 +379,7 @@ static inline void pwmchip_set_drvdata(struct pwm_chip *chip, void *data) dev_set_drvdata(&chip->dev, data); } -#if IS_ENABLED(CONFIG_PWM) +#if IS_REACHABLE(CONFIG_PWM) /* PWM consumer APIs */ int pwm_round_waveform_might_sleep(struct pwm_device *pwm, struct pwm_waveform *wf); @@ -661,7 +661,7 @@ struct pwm_lookup { PWM_LOOKUP_WITH_MODULE(_provider, _index, _dev_id, _con_id, _period, \ _polarity, NULL) -#if IS_ENABLED(CONFIG_PWM) +#if IS_REACHABLE(CONFIG_PWM) void pwm_add_table(struct pwm_lookup *table, size_t num); void pwm_remove_table(struct pwm_lookup *table, size_t num); #else -- cgit v1.2.3 From 0c542a69cbcd1fefad32c59cea7a80413fe60922 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 15:15:13 -0700 Subject: io_uring/uring_cmd: specify io_uring_cmd_import_fixed() pointer type io_uring_cmd_import_fixed() takes a struct io_uring_cmd *, but the type of the ioucmd parameter is void *. Make the pointer type explicit so the compiler can type-check it. 
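A sketch of a typical call site inside a driver's ->uring_cmd() handler (the wrapper is hypothetical; ITER_DEST selects the read-into-buffer direction):

#include <linux/io_uring/cmd.h>
#include <linux/uio.h>

static int example_import(struct io_uring_cmd *ioucmd, u64 ubuf,
			  unsigned long len, unsigned int issue_flags)
{
	struct iov_iter iter;

	/* Passing anything but an io_uring_cmd * is now a build error. */
	return io_uring_cmd_import_fixed(ubuf, len, ITER_DEST, &iter,
					 ioucmd, issue_flags);
}
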
Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250228221514.604350-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index cf8d80d84734..5bc4f0d58506 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -40,7 +40,8 @@ static inline void io_uring_cmd_private_sz_check(size_t cmd_sz) #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd, + struct iov_iter *iter, + struct io_uring_cmd *ioucmd, unsigned int issue_flags); /* @@ -68,9 +69,10 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); #else -static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd, - unsigned int issue_flags) +static inline int +io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, struct io_uring_cmd *ioucmd, + unsigned int issue_flags) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 09fdd35162c289f354326a55d552a8858f6e8072 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 16:03:04 -0700 Subject: io_uring: convert cmd_to_io_kiocb() macro to function The cmd_to_io_kiocb() macro applies a pointer cast to its input without parenthesizing it. Currently all inputs are variable names, so this has the intended effect. But since casts have relatively high precedence, the macro would apply the cast to the wrong value if the input was a pointer addition, for example. Turn the macro into a static inline function to ensure the pointer cast is applied to the full input value. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250228230305.630885-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 432c98ff52ee..72aac84dca93 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -607,7 +607,11 @@ static inline void io_kiocb_cmd_sz_check(size_t cmd_sz) io_kiocb_cmd_sz_check(sizeof(cmd_type)) , \ ((cmd_type *)&(req)->cmd) \ ) -#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) + +static inline struct io_kiocb *cmd_to_io_kiocb(void *ptr) +{ + return ptr; +} struct io_kiocb { union { -- cgit v1.2.3 From e6ea7ec494881bcf61b8f0f77f7cb3542f717ff2 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 16:14:31 -0700 Subject: io_uring/ublk: report error when unregister operation fails Indicate to userspace applications if a UBLK_IO_UNREGISTER_IO_BUF command specifies an invalid buffer index by returning an error code. Return -EINVAL if no buffer is registered with the given index, and -EBUSY if the registered buffer is not a kernel bvec. 
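For illustration, a caller honouring the new return value might do (sketch only):

static void example_unregister(struct io_uring_cmd *cmd, unsigned int index,
			       unsigned int issue_flags)
{
	int ret = io_buffer_unregister_bvec(cmd, index, issue_flags);

	/* -EINVAL: nothing at @index; -EBUSY: slot is not a kernel bvec. */
	if (ret)
		pr_warn("failed to unregister buffer %u: %d\n", index, ret);
}
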
Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250228231432.642417-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 5bc4f0d58506..598cacda4aa3 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -131,7 +131,7 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, void (*release)(void *), unsigned int index, unsigned int issue_flags); -void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, - unsigned int issue_flags); +int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags); #endif /* _LINUX_IO_URING_CMD_H */ -- cgit v1.2.3 From dc4d58a752ea6cb0821d889e8412c22d5289f3d3 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 18 Feb 2025 14:40:02 -0600 Subject: perf: arm_pmu: Move PMUv3-specific data A few fields in struct arm_pmu are only used with PMUv3, and soon we will need to add more for BRBE. Group the fields together so that we have a logical place to add more data in future. At the same time, remove the comment for reg_pmmir as it doesn't convey anything useful. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Signed-off-by: Rob Herring (Arm) Reviewed-by: Anshuman Khandual Tested-by: James Clark Link: https://lore.kernel.org/r/20250218-arm-brbe-v19-v20-7-4e9922fc2e8e@kernel.org Signed-off-by: Will Deacon --- include/linux/perf/arm_pmu.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 4b5b83677e3f..c70d528594f2 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -84,7 +84,6 @@ struct arm_pmu { struct pmu pmu; cpumask_t supported_cpus; char *name; - int pmuver; irqreturn_t (*handle_irq)(struct arm_pmu *pmu); void (*enable)(struct perf_event *event); void (*disable)(struct perf_event *event); @@ -102,18 +101,20 @@ struct arm_pmu { int (*map_event)(struct perf_event *event); DECLARE_BITMAP(cntr_mask, ARMPMU_MAX_HWEVENTS); bool secure_access; /* 32-bit ARM only */ -#define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 - DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); -#define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE 0x4000 - DECLARE_BITMAP(pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); struct platform_device *plat_device; struct pmu_hw_events __percpu *hw_events; struct hlist_node node; struct notifier_block cpu_pm_nb; /* the attr_groups array must be NULL-terminated */ const struct attribute_group *attr_groups[ARMPMU_NR_ATTR_GROUPS + 1]; - /* store the PMMIR_EL1 to expose slots */ + + /* PMUv3 only */ + int pmuver; u64 reg_pmmir; +#define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 + DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); +#define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE 0x4000 + DECLARE_BITMAP(pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); /* Only to be used by ACPI probing code */ unsigned long acpi_cpuid; -- cgit v1.2.3 From 916b7f42b3b3b539a71c204a9b49fdc4ca92cd82 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 15:06:31 -0800 Subject: kvm: retry nx_huge_page_recovery_thread creation A VMM may send a non-fatal signal to its threads, including vCPU 
tasks, at any time, and thus may signal vCPU tasks during KVM_RUN. If a vCPU task receives the signal while it's trying to spawn the huge page recovery vhost task, then KVM_RUN will fail due to copy_process() returning -ERESTARTNOINTR. Rework call_once() to mark the call complete if and only if the called function succeeds, and plumb the function's true error code back to the call_once() invoker. This provides userspace with the correct, non-fatal error code so that the VMM doesn't terminate the VM on -ENOMEM, and allows a subsequent KVM_RUN to succeed by virtue of retrying creation of the NX huge page task. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson [implemented the kvm user side] Signed-off-by: Keith Busch Message-ID: <20250227230631.303431-3-kbusch@meta.com> Signed-off-by: Paolo Bonzini --- include/linux/call_once.h | 47 ++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/call_once.h b/include/linux/call_once.h index 6261aa0b3fb0..13cd6469e7e5 100644 --- a/include/linux/call_once.h +++ b/include/linux/call_once.h @@ -26,20 +26,41 @@ do { \ __once_init((once), #once, &__key); \ } while (0) -static inline void call_once(struct once *once, void (*cb)(struct once *)) +/* + * call_once - Ensure a function has been called exactly once + * + * @once: Tracking struct + * @cb: Function to be called + * + * If @once has never completed successfully before, call @cb and, if + * it returns a zero or positive value, mark @once as completed. Return + * the value returned by @cb + * + * If @once has completed succesfully before, return 0. + * + * The call to @cb is implicitly surrounded by a mutex, though for + * efficiency the * function avoids taking it after the first call. + */ +static inline int call_once(struct once *once, int (*cb)(struct once *)) { - /* Pairs with atomic_set_release() below. */ - if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) - return; - - guard(mutex)(&once->lock); - WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); - if (atomic_read(&once->state) != ONCE_NOT_STARTED) - return; - - atomic_set(&once->state, ONCE_RUNNING); - cb(once); - atomic_set_release(&once->state, ONCE_COMPLETED); + int r, state; + + /* Pairs with atomic_set_release() below. */ + if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) + return 0; + + guard(mutex)(&once->lock); + state = atomic_read(&once->state); + if (unlikely(state != ONCE_NOT_STARTED)) + return WARN_ON_ONCE(state != ONCE_COMPLETED) ? -EINVAL : 0; + + atomic_set(&once->state, ONCE_RUNNING); + r = cb(once); + if (r < 0) + atomic_set(&once->state, ONCE_NOT_STARTED); + else + atomic_set_release(&once->state, ONCE_COMPLETED); + return r; } #endif /* _LINUX_CALL_ONCE_H */ -- cgit v1.2.3 From e249056c91a2f14ee40de2bf24cf72d8e68101f5 Mon Sep 17 00:00:00 2001 From: Pan Deng Date: Fri, 28 Feb 2025 10:00:59 +0800 Subject: fs: place f_ref to 3rd cache line in struct file to resolve false sharing When running syscall pread in a high core count system, f_ref contends with the reading of f_mode, f_op, f_mapping, f_inode, f_flags in the same cache line. This change places f_ref in the 3rd cache line, where fields are not updated as frequently as in the 1st cache line, and the contention is greatly reduced according to tests. In addition, the size of the file object is kept in 3 cache lines. 
This change has been tested with the rocksdb benchmark readwhilewriting case on a 1 socket 64 physical core 128 logical core baremetal machine, with build config CONFIG_RANDSTRUCT_NONE=y Command: ./db_bench --benchmarks="readwhilewriting" --threads $cnt --duration 60 The throughput (ops/s) is improved by up to ~21%:

	thread	baseline	compare
	16	100%		+1.3%
	32	100%		+2.2%
	64	100%		+7.2%
	128	100%		+20.9%

It was also tested with UnixBench: syscall, fsbuffer, fstime, fsdisk cases that have been used for file struct layout tuning; no regression was observed. Signed-off-by: Pan Deng Link: https://lore.kernel.org/r/20250228020059.3023375-1-pan.deng@intel.com Tested-by: Lipeng Zhu Reviewed-by: Tianyou Li Reviewed-by: Tim Chen Signed-off-by: Christian Brauner --- include/linux/fs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 70c985f3d787..9ab789cd1531 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1058,7 +1058,6 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) /** * struct file - Represents a file - * @f_ref: reference count * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. * @f_mode: FMODE_* flags often used in hotpaths * @f_op: file operations @@ -1068,12 +1067,12 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_flags: file flags * @f_iocb_flags: iocb flags * @f_cred: stashed credentials of creator/opener + * @f_owner: file owner * @f_path: path of the file * @f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position * @f_security: LSM security context of this file - * @f_owner: file owner * @f_wb_err: writeback error * @f_sb_err: per sb writeback errors * @f_ep: link of all epoll hooks for this file @@ -1081,9 +1080,9 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_llist: work queue entrypoint * @f_ra: file's readahead state * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) + * @f_ref: reference count */ struct file { - file_ref_t f_ref; spinlock_t f_lock; fmode_t f_mode; const struct file_operations *f_op; @@ -1093,6 +1092,7 @@ struct file { unsigned int f_flags; unsigned int f_iocb_flags; const struct cred *f_cred; + struct fown_struct *f_owner; /* --- cacheline 1 boundary (64 bytes) --- */ struct path f_path; union { @@ -1106,7 +1106,6 @@ struct file { void *f_security; #endif /* --- cacheline 2 boundary (128 bytes) --- */ - struct fown_struct *f_owner; errseq_t f_wb_err; errseq_t f_sb_err; #ifdef CONFIG_EPOLL @@ -1118,6 +1117,7 @@ struct file { struct file_ra_state f_ra; freeptr_t f_freeptr; }; + file_ref_t f_ref; /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ -- cgit v1.2.3 From dc90c890363d3e4b0ec73cd21b5be592692723fd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 3 Feb 2025 15:07:30 +0100 Subject: asm-generic/io.h: rework split ioread64/iowrite64 helpers There are two incompatible sets of definitions of these eight functions: On 64-bit architectures setting CONFIG_HAS_IOPORT, they turn into either a pair of 32-bit PIO (inl/outl) accesses or a single 64-bit MMIO (readq/writeq). On other 64-bit architectures, they are always split into 32-bit accesses. 
Depending on which header gets included in a driver, there are additionally definitions for ioread64()/iowrite64() that are expected to produce a 64-bit register MMIO access on all 64-bit architectures. To separate the conflicting definitions, make the version in include/linux/io-64-nonatomic-*.h visible on all architectures but pick the one from lib/iomap.c on architectures that set CONFIG_GENERIC_IOMAP in place of the default fallback. Acked-by: Andy Shevchenko Signed-off-by: Arnd Bergmann --- include/linux/io-64-nonatomic-hi-lo.h | 16 ++++++++++++++++ include/linux/io-64-nonatomic-lo-hi.h | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io-64-nonatomic-hi-lo.h b/include/linux/io-64-nonatomic-hi-lo.h index f32522bb3aa5..d3eade7cf663 100644 --- a/include/linux/io-64-nonatomic-hi-lo.h +++ b/include/linux/io-64-nonatomic-hi-lo.h @@ -101,22 +101,38 @@ static inline void iowrite64be_hi_lo(u64 val, void __iomem *addr) #ifndef ioread64 #define ioread64_is_nonatomic +#if defined(CONFIG_GENERIC_IOMAP) && defined(CONFIG_64BIT) +#define ioread64 __ioread64_hi_lo +#else #define ioread64 ioread64_hi_lo #endif +#endif #ifndef iowrite64 #define iowrite64_is_nonatomic +#if defined(CONFIG_GENERIC_IOMAP) && defined(CONFIG_64BIT) +#define iowrite64 __iowrite64_hi_lo +#else #define iowrite64 iowrite64_hi_lo #endif +#endif #ifndef ioread64be #define ioread64be_is_nonatomic +#if defined(CONFIG_GENERIC_IOMAP) && defined(CONFIG_64BIT) +#define ioread64be __ioread64be_hi_lo +#else #define ioread64be ioread64be_hi_lo #endif +#endif #ifndef iowrite64be #define iowrite64be_is_nonatomic +#if defined(CONFIG_GENERIC_IOMAP) && defined(CONFIG_64BIT) +#define iowrite64be __iowrite64be_hi_lo +#else #define iowrite64be iowrite64be_hi_lo #endif +#endif #endif /* _LINUX_IO_64_NONATOMIC_HI_LO_H_ */ diff --git a/include/linux/io-64-nonatomic-lo-hi.h b/include/linux/io-64-nonatomic-lo-hi.h index 448a21435dba..94e676ec3d3f 100644 --- a/include/linux/io-64-nonatomic-lo-hi.h +++ b/include/linux/io-64-nonatomic-lo-hi.h @@ -101,22 +101,38 @@ static inline void iowrite64be_lo_hi(u64 val, void __iomem *addr) #ifndef ioread64 #define ioread64_is_nonatomic +#if defined(CONFIG_GENERIC_IOMAP) && defined(CONFIG_64BIT) +#define ioread64 __ioread64_lo_hi +#else #define ioread64 ioread64_lo_hi #endif +#endif #ifndef iowrite64 #define iowrite64_is_nonatomic +#if defined(CONFIG_GENERIC_IOMAP) && defined(CONFIG_64BIT) +#define iowrite64 __iowrite64_lo_hi +#else #define iowrite64 iowrite64_lo_hi #endif +#endif #ifndef ioread64be #define ioread64be_is_nonatomic +#if defined(CONFIG_GENERIC_IOMAP) && defined(CONFIG_64BIT) +#define ioread64be __ioread64be_lo_hi +#else #define ioread64be ioread64be_lo_hi #endif +#endif #ifndef iowrite64be #define iowrite64be_is_nonatomic +#if defined(CONFIG_GENERIC_IOMAP) && defined(CONFIG_64BIT) +#define iowrite64be __iowrite64be_lo_hi +#else #define iowrite64be iowrite64be_lo_hi #endif +#endif #endif /* _LINUX_IO_64_NONATOMIC_LO_HI_H_ */ -- cgit v1.2.3 From 9e81c965742c2a9a28cf59416fee40e7cb1b773e Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Thu, 27 Feb 2025 12:37:47 +0530 Subject: io.h: drop unused headers Drop unused headers and type declaration from io.h. 
Signed-off-by: Raag Jadav Acked-by: Andy Shevchenko Signed-off-by: Arnd Bergmann --- include/linux/io.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io.h b/include/linux/io.h index 59ec5eea696c..cc2c6ebdc585 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -9,13 +9,10 @@ #include #include #include -#include -#include #include #include struct device; -struct resource; #ifndef __iowrite32_copy void __iowrite32_copy(void __iomem *to, const void *from, size_t count); -- cgit v1.2.3 From e04918dc594669068f5d59d567d08db531167188 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 2 Mar 2025 15:18:24 +0800 Subject: cred: Fix RCU warnings in override/revert_creds Fix RCU warnings in override_creds and revert_creds by turning the RCU pointer into a normal pointer using rcu_replace_pointer. These warnings were previously private to the cred code, but due to the move into the header file they are now polluting unrelated subsystems. Fixes: 49dffdfde462 ("cred: Add a light version of override/revert_creds()") Signed-off-by: Herbert Xu Link: https://lore.kernel.org/r/Z8QGQGW0IaSklKG7@gondor.apana.org.au Signed-off-by: Christian Brauner --- include/linux/cred.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 0c3c4b16b469..5658a3bfe803 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -172,18 +172,12 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred) static inline const struct cred *override_creds(const struct cred *override_cred) { - const struct cred *old = current->cred; - - rcu_assign_pointer(current->cred, override_cred); - return old; + return rcu_replace_pointer(current->cred, override_cred, 1); } static inline const struct cred *revert_creds(const struct cred *revert_cred) { - const struct cred *override_cred = current->cred; - - rcu_assign_pointer(current->cred, revert_cred); - return override_cred; + return rcu_replace_pointer(current->cred, revert_cred, 1); } /** -- cgit v1.2.3 From fafac0ebb289288c767272d600644701e0df8a76 Mon Sep 17 00:00:00 2001 From: Mikhail Paulyshka Date: Sun, 2 Mar 2025 18:50:09 +0300 Subject: hwmon: (k10temp) add support for cyan skillfish Add support for Cyan Skillfish (AMD Family 17h Model 47h), which appears to be a Zen 2 based APU. The patch was tested with an AMD BC-250 board. Signed-off-by: Mikhail Paulyshka Link: https://lore.kernel.org/r/20250302155009.49951-1-me@mixaill.net Signed-off-by: Guenter Roeck --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index de5deb1a0118..7a4ad478487d 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -569,6 +569,7 @@ #define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F3 0x15eb #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F3 0x1493 +#define PCI_DEVICE_ID_AMD_17H_M40H_DF_F3 0x13f3 #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F3 0x144b #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F3 0x1443 #define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3 0x1727 -- cgit v1.2.3 From 6224e7fc1ce75edcd03b56a2e0fd4c1765d5888e Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 27 Feb 2025 09:37:47 +0100 Subject: gpiolib: deprecate gpio_chip::set and gpio_chip::set_multiple We now have setter callbacks that allow us to indicate success or failure using the integer return value. 
Deprecate the older callbacks so that no new code is tempted to use them. Link: https://lore.kernel.org/r/20250227083748.22400-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index a2a1b6434321..783897d94be8 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -346,8 +346,8 @@ struct gpio_irq_chip { * @get: returns value for signal "offset", 0=low, 1=high, or negative error * @get_multiple: reads values for multiple signals defined by "mask" and * stores them in "bits", returns 0 on success or negative error - * @set: assigns output value for signal "offset" - * @set_multiple: assigns output values for multiple signals defined by "mask" + * @set: **DEPRECATED** - please use set_rv() instead + * @set_multiple: **DEPRECATED** - please use set_multiple_rv() instead * @set_rv: assigns output value for signal "offset", returns 0 on success or * negative error value * @set_multiple_rv: assigns output values for multiple signals defined by -- cgit v1.2.3 From 7636f090d02e791918bb3c924e695880123d0c59 Mon Sep 17 00:00:00 2001 From: Pengyu Luo Date: Sat, 15 Feb 2025 02:06:55 +0800 Subject: platform: arm64: add Huawei Matebook E Go EC driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are three variants, of which Huawei released the first two simultaneously.

Huawei Matebook E Go LTE(sc8180x), codename seems to be gaokun2.
Huawei Matebook E Go(sc8280xp@3.0GHz), codename must be gaokun3. (see [1])
Huawei Matebook E Go 2023(sc8280xp@2.69GHz), codename should also be gaokun3.

Adding support for the latter two variants for now; this driver should also work for the sc8180x variant according to ACPI table files, but I don't have the device to test yet. Different from other Qualcomm Snapdragon sc8280xp based machines, the Huawei Matebook E Go uses an embedded controller while others use a system called PMIC GLink. This embedded controller can be used to perform a set of various functions, including, but not limited to:

- Battery and charger monitoring;
- Charge control and smart charge;
- Fn_lock settings;
- Tablet lid status;
- Temperature sensors;
- USB Type-C notifications (ports orientation, DP alt mode HPD);
- USB Type-C PD (according to observation, up to 48w).

Add a driver for the EC which creates devices for UCSI and power supply devices. This driver is inspired by the following drivers:

drivers/platform/arm64/acer-aspire1-ec.c
drivers/platform/arm64/lenovo-yoga-c630.c
drivers/platform/x86/huawei-wmi.c

Thanks also to the reviewers for their work; they have made this patch a lot better. 
[1] https://bugzilla.kernel.org/show_bug.cgi?id=219645 Signed-off-by: Pengyu Luo Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20250214180656.28599-3-mitltlatltl@gmail.com Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/huawei-gaokun-ec.h | 79 ++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 include/linux/platform_data/huawei-gaokun-ec.h (limited to 'include/linux') diff --git a/include/linux/platform_data/huawei-gaokun-ec.h b/include/linux/platform_data/huawei-gaokun-ec.h new file mode 100644 index 000000000000..faa15d315128 --- /dev/null +++ b/include/linux/platform_data/huawei-gaokun-ec.h @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Huawei Matebook E Go Embedded Controller + * + * Copyright (C) 2024-2025 Pengyu Luo + */ + +#ifndef __HUAWEI_GAOKUN_EC_H__ +#define __HUAWEI_GAOKUN_EC_H__ + +#define GAOKUN_UCSI_CCI_SIZE 4 +#define GAOKUN_UCSI_MSGI_SIZE 16 +#define GAOKUN_UCSI_READ_SIZE (GAOKUN_UCSI_CCI_SIZE + GAOKUN_UCSI_MSGI_SIZE) +#define GAOKUN_UCSI_WRITE_SIZE 24 /* 8B CTRL, 16B MSGO */ + +#define GAOKUN_UCSI_NO_PORT_UPDATE (-1) + +#define GAOKUN_SMART_CHARGE_DATA_SIZE 4 /* mode, delay, start, end */ + +/* -------------------------------------------------------------------------- */ + +struct gaokun_ec; +struct gaokun_ucsi_reg; +struct notifier_block; + +#define GAOKUN_MOD_NAME "huawei_gaokun_ec" +#define GAOKUN_DEV_PSY "psy" +#define GAOKUN_DEV_UCSI "ucsi" + +/* -------------------------------------------------------------------------- */ +/* Common API */ + +int gaokun_ec_register_notify(struct gaokun_ec *ec, + struct notifier_block *nb); +void gaokun_ec_unregister_notify(struct gaokun_ec *ec, + struct notifier_block *nb); + +int gaokun_ec_read(struct gaokun_ec *ec, const u8 *req, + size_t resp_len, u8 *resp); +int gaokun_ec_write(struct gaokun_ec *ec, const u8 *req); +int gaokun_ec_read_byte(struct gaokun_ec *ec, const u8 *req, u8 *byte); + +/* -------------------------------------------------------------------------- */ +/* API for PSY */ + +int gaokun_ec_psy_multi_read(struct gaokun_ec *ec, u8 reg, + size_t resp_len, u8 *resp); + +static inline int gaokun_ec_psy_read_byte(struct gaokun_ec *ec, + u8 reg, u8 *byte) +{ + return gaokun_ec_psy_multi_read(ec, reg, sizeof(*byte), byte); +} + +static inline int gaokun_ec_psy_read_word(struct gaokun_ec *ec, + u8 reg, u16 *word) +{ + return gaokun_ec_psy_multi_read(ec, reg, sizeof(*word), (u8 *)word); +} + +int gaokun_ec_psy_get_smart_charge(struct gaokun_ec *ec, + u8 resp[GAOKUN_SMART_CHARGE_DATA_SIZE]); +int gaokun_ec_psy_set_smart_charge(struct gaokun_ec *ec, + const u8 req[GAOKUN_SMART_CHARGE_DATA_SIZE]); + +int gaokun_ec_psy_get_smart_charge_enable(struct gaokun_ec *ec, bool *on); +int gaokun_ec_psy_set_smart_charge_enable(struct gaokun_ec *ec, bool on); + +/* -------------------------------------------------------------------------- */ +/* API for UCSI */ + +int gaokun_ec_ucsi_read(struct gaokun_ec *ec, u8 resp[GAOKUN_UCSI_READ_SIZE]); +int gaokun_ec_ucsi_write(struct gaokun_ec *ec, + const u8 req[GAOKUN_UCSI_WRITE_SIZE]); + +int gaokun_ec_ucsi_get_reg(struct gaokun_ec *ec, struct gaokun_ucsi_reg *ureg); +int gaokun_ec_ucsi_pan_ack(struct gaokun_ec *ec, int port_id); + +#endif /* __HUAWEI_GAOKUN_EC_H__ */ -- cgit v1.2.3 From 9fab91e0eae3a9ec1861119877a61d571f21536a Mon Sep 17 00:00:00 2001 From: "Dr. 
David Alan Gilbert" Date: Sun, 23 Feb 2025 16:06:02 +0000 Subject: usb: ulpi: Remove unused otg_ulpi_create otg_ulpi_create() has been unused since 2022's commit 8ca79aaad8be ("ARM: pxa: remove unused pxa3xx-ulpi") Remove it. The devm_ variant is still used. Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20250223160602.91916-1-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/ulpi.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/ulpi.h b/include/linux/usb/ulpi.h index 5050f502c1ed..4b651065738a 100644 --- a/include/linux/usb/ulpi.h +++ b/include/linux/usb/ulpi.h @@ -49,19 +49,10 @@ /*-------------------------------------------------------------------------*/ #if IS_ENABLED(CONFIG_USB_ULPI) -struct usb_phy *otg_ulpi_create(struct usb_phy_io_ops *ops, - unsigned int flags); - struct usb_phy *devm_otg_ulpi_create(struct device *dev, struct usb_phy_io_ops *ops, unsigned int flags); #else -static inline struct usb_phy *otg_ulpi_create(struct usb_phy_io_ops *ops, - unsigned int flags) -{ - return NULL; -} - static inline struct usb_phy *devm_otg_ulpi_create(struct device *dev, struct usb_phy_io_ops *ops, unsigned int flags) -- cgit v1.2.3 From 7ab02bd36eb444654183ad6c5b15211ddfa32a8f Mon Sep 17 00:00:00 2001 From: "Nysal Jan K.A." Date: Mon, 3 Mar 2025 11:34:50 +0530 Subject: sched/membarrier: Fix redundant load of membarrier_state On architectures where ARCH_HAS_SYNC_CORE_BEFORE_USERMODE is not selected, sync_core_before_usermode() is a no-op. In membarrier_mm_sync_core_before_usermode() the compiler does not eliminate redundant branches and load of mm->membarrier_state for this case as the atomic_read() cannot be optimized away. Here's a snippet of the code generated for finish_task_switch() on powerpc prior to this change: 1b786c: ld r26,2624(r30) # mm = rq->prev_mm; ....... 1b78c8: cmpdi cr7,r26,0 1b78cc: beq cr7,1b78e4 1b78d0: ld r9,2312(r13) # current 1b78d4: ld r9,1888(r9) # current->mm 1b78d8: cmpd cr7,r26,r9 1b78dc: beq cr7,1b7a70 1b78e0: hwsync 1b78e4: cmplwi cr7,r27,128 ....... 1b7a70: lwz r9,176(r26) # atomic_read(&mm->membarrier_state) 1b7a74: b 1b78e0 This was found while analyzing "perf c2c" reports on kernels prior to commit c1753fd02a00 ("mm: move mm_count into its own cache line") where mm_count was false sharing with membarrier_state. There is a minor improvement in the size of finish_task_switch(). The following are results from bloat-o-meter for ppc64le: GCC 7.5.0 --------- add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-32 (-32) Function old new delta finish_task_switch 884 852 -32 GCC 12.2.1 ---------- add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-32 (-32) Function old new delta finish_task_switch.isra 852 820 -32 LLVM 17.0.6 ----------- add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-36 (-36) Function old new delta rt_mutex_schedule 120 104 -16 finish_task_switch 792 772 -20 Results on aarch64: GCC 14.1.1 ---------- add/remove: 0/2 grow/shrink: 1/1 up/down: 4/-60 (-56) Function old new delta get_nohz_timer_target 352 356 +4 e843419@0b02_0000d7e7_408 8 - -8 e843419@01bb_000021d2_868 8 - -8 finish_task_switch.isra 592 548 -44 Signed-off-by: Nysal Jan K.A. 
Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Reviewed-by: Michael Ellerman Reviewed-by: Segher Boessenkool Link: https://lore.kernel.org/r/20250303060457.531293-1-nysal@linux.ibm.com --- include/linux/sched/mm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 928a626725e6..b13474825130 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -531,6 +531,13 @@ enum { static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { + /* + * The atomic_read() below prevents CSE. The following should + * help the compiler generate more efficient code on architectures + * where sync_core_before_usermode() is a no-op. + */ + if (!IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE)) + return; if (current->mm != mm) return; if (likely(!(atomic_read(&mm->membarrier_state) & -- cgit v1.2.3 From 520a552f19d55825108ab83da093084c9afb50e9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 18 Feb 2025 21:20:46 +0100 Subject: PM: sleep: Avoid unnecessary checks in device_prepare_smart_suspend() Add an optimization (on top of previous changes) to avoid calling pm_runtime_blocked(), which involves acquiring the device's PM spinlock, for devices with no PM callbacks and runtime PM "blocked". Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/2978873.e9J7NaK4W3@rjwysocki.net --- include/linux/pm_runtime.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index aea0395c10a1..01ead602aedd 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -77,7 +77,7 @@ extern int pm_runtime_get_if_in_use(struct device *dev); extern int pm_schedule_suspend(struct device *dev, unsigned int delay); extern int __pm_runtime_set_status(struct device *dev, unsigned int status); extern int pm_runtime_barrier(struct device *dev); -extern void pm_runtime_block_if_disabled(struct device *dev); +extern bool pm_runtime_block_if_disabled(struct device *dev); extern void pm_runtime_unblock(struct device *dev); extern void pm_runtime_enable(struct device *dev); extern void __pm_runtime_disable(struct device *dev, bool check_resume); @@ -274,7 +274,7 @@ static inline int pm_runtime_get_if_active(struct device *dev) static inline int __pm_runtime_set_status(struct device *dev, unsigned int status) { return 0; } static inline int pm_runtime_barrier(struct device *dev) { return 0; } -static inline void pm_runtime_block_if_disabled(struct device *dev) {} +static inline bool pm_runtime_block_if_disabled(struct device *dev) { return true; } static inline void pm_runtime_unblock(struct device *dev) {} static inline void pm_runtime_enable(struct device *dev) {} static inline void __pm_runtime_disable(struct device *dev, bool c) {} -- cgit v1.2.3 From 630d55e038728f6f5917da051984a5ec515d152e Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 21 Feb 2025 05:02:19 -0800 Subject: PM: wakeup: Remove needless return in three void APIs Remove needless 'return' in the following void APIs: __pm_wakeup_event() pm_wakeup_event() pm_wakeup_hard_event() Since both the API and callee involved are void functions. Signed-off-by: Zijun Hu Link: https://patch.msgid.link/20250221-rmv_return-v1-14-cc8dff275827@quicinc.com Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm_wakeup.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index d501c09c60cd..51e0e8dd5f9e 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -205,17 +205,17 @@ static inline void device_set_awake_path(struct device *dev) static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) { - return pm_wakeup_ws_event(ws, msec, false); + pm_wakeup_ws_event(ws, msec, false); } static inline void pm_wakeup_event(struct device *dev, unsigned int msec) { - return pm_wakeup_dev_event(dev, msec, false); + pm_wakeup_dev_event(dev, msec, false); } static inline void pm_wakeup_hard_event(struct device *dev) { - return pm_wakeup_dev_event(dev, 0, true); + pm_wakeup_dev_event(dev, 0, true); } /** -- cgit v1.2.3 From 7304d1909080ef0c9da703500a97f46c98393fcd Mon Sep 17 00:00:00 2001 From: Md Sadre Alam Date: Mon, 24 Feb 2025 16:44:14 +0530 Subject: spi: spi-qpic: add driver for QCOM SPI NAND flash Interface This driver implements support for the SPI-NAND mode of QCOM NAND Flash Interface as a SPI-MEM controller with pipelined ECC capability. Co-developed-by: Sricharan Ramabadhran Signed-off-by: Sricharan Ramabadhran Co-developed-by: Varadarajan Narayanan Signed-off-by: Varadarajan Narayanan Signed-off-by: Md Sadre Alam Link: https://patch.msgid.link/20250224111414.2809669-3-quic_mdalam@quicinc.com Signed-off-by: Mark Brown --- include/linux/mtd/nand-qpic-common.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand-qpic-common.h b/include/linux/mtd/nand-qpic-common.h index 4d9b736ff8b7..7760154de581 100644 --- a/include/linux/mtd/nand-qpic-common.h +++ b/include/linux/mtd/nand-qpic-common.h @@ -325,6 +325,10 @@ struct nandc_regs { __le32 read_location_last1; __le32 read_location_last2; __le32 read_location_last3; + __le32 spi_cfg; + __le32 num_addr_cycle; + __le32 busy_wait_cnt; + __le32 flash_feature; __le32 erased_cw_detect_cfg_clr; __le32 erased_cw_detect_cfg_set; @@ -339,6 +343,7 @@ struct nandc_regs { * * @core_clk: controller clock * @aon_clk: another controller clock + * @iomacro_clk: io macro clock * * @regs: a contiguous chunk of memory for DMA register * writes. contains the register values to be @@ -348,6 +353,7 @@ struct nandc_regs { * initialized via DT match data * * @controller: base controller structure + * @qspi: qpic spi structure * @host_list: list containing all the chips attached to the * controller * @@ -392,6 +398,7 @@ struct qcom_nand_controller { const struct qcom_nandc_props *props; struct nand_controller *controller; + struct qpic_spi_nand *qspi; struct list_head host_list; union { -- cgit v1.2.3 From eeb87d17aceab7803a5a5bcb6cf2817b745157cf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 27 Feb 2025 11:53:50 +0100 Subject: PM: sleep: Adjust check before setting power.must_resume The check before setting power.must_resume in device_suspend_noirq() does not take power.child_count into account, but it should do that, so use pm_runtime_need_not_resume() in it for this purpose and adjust the comment next to it accordingly. Fixes: 107d47b2b95e ("PM: sleep: core: Simplify the SMART_SUSPEND flag handling") Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/3353728.44csPzL39Z@rjwysocki.net --- include/linux/pm_runtime.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 01ead602aedd..4c5b74b745d5 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -66,6 +66,7 @@ static inline bool queue_pm_work(struct work_struct *work) extern int pm_generic_runtime_suspend(struct device *dev); extern int pm_generic_runtime_resume(struct device *dev); +extern bool pm_runtime_need_not_resume(struct device *dev); extern int pm_runtime_force_suspend(struct device *dev); extern int pm_runtime_force_resume(struct device *dev); @@ -244,6 +245,7 @@ static inline bool queue_pm_work(struct work_struct *work) { return false; } static inline int pm_generic_runtime_suspend(struct device *dev) { return 0; } static inline int pm_generic_runtime_resume(struct device *dev) { return 0; } +static inline bool pm_runtime_need_not_resume(struct device *dev) {return true; } static inline int pm_runtime_force_suspend(struct device *dev) { return 0; } static inline int pm_runtime_force_resume(struct device *dev) { return 0; } -- cgit v1.2.3 From 1476bb20eec33bd68b67c7bb7a5d62063af8148d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 27 Feb 2025 11:47:33 +0100 Subject: PM: runtime: Convert pm_runtime_blocked() to static inline The comment in pm_runtime_blocked() is actually wrong: power.last_status is not a bit field. Its data type is an enum and so one can reasonably assume that partial updates of it will not be observed. Accordingly, pm_runtime_blocked() can be converted to a static inline function and the related locking overhead can be eliminated, so long as it is only used in system suspend/resume code paths because power.last_status is not expected to be updated concurrently while that code is running. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/1923449.tdWV9SEqCh@rjwysocki.net --- include/linux/pm_runtime.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 4c5b74b745d5..7fb5a459847e 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -82,7 +82,6 @@ extern bool pm_runtime_block_if_disabled(struct device *dev); extern void pm_runtime_unblock(struct device *dev); extern void pm_runtime_enable(struct device *dev); extern void __pm_runtime_disable(struct device *dev, bool check_resume); -extern bool pm_runtime_blocked(struct device *dev); extern void pm_runtime_allow(struct device *dev); extern void pm_runtime_forbid(struct device *dev); extern void pm_runtime_no_callbacks(struct device *dev); @@ -200,6 +199,17 @@ static inline bool pm_runtime_enabled(struct device *dev) return !dev->power.disable_depth; } +/** + * pm_runtime_blocked - Check if runtime PM enabling is blocked. + * @dev: Target device. + * + * Do not call this function outside system suspend/resume code paths. + */ +static inline bool pm_runtime_blocked(struct device *dev) +{ + return dev->power.last_status == RPM_BLOCKED; +} + /** * pm_runtime_has_no_callbacks - Check if runtime PM callbacks may be present. * @dev: Target device. -- cgit v1.2.3 From 3038b22bc098565b93cfb3cc8933b89701feb407 Mon Sep 17 00:00:00 2001 From: "Rafael J."
Wysocki" Date: Tue, 25 Feb 2025 17:38:58 +0100 Subject: PM: sleep: Rename power.async_in_progress to power.work_in_progress Rename the async_in_progress field in struct dev_pm_info to work_in_progress as after subsequent changes it will mean work in general rather than just async work. No functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/3338693.aeNJFYEL58@rjwysocki.net --- include/linux/pm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 24647108f0ad..63a8dffda787 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -679,7 +679,7 @@ struct dev_pm_info { bool wakeup_path:1; bool syscore:1; bool no_pm_callbacks:1; /* Owned by the PM core */ - bool async_in_progress:1; /* Owned by the PM core */ + bool work_in_progress:1; /* Owned by the PM core */ bool smart_suspend:1; /* Owned by the PM core */ bool must_resume:1; /* Owned by the PM core */ bool may_skip_resume:1; /* Set by subsystems */ -- cgit v1.2.3 From 5f9c23abc47744f2578af4a362655c31254c93b5 Mon Sep 17 00:00:00 2001 From: Paul Benoit Date: Tue, 18 Feb 2025 16:59:32 -0800 Subject: firmware: smccc: Support optional Arm SMCCC SOC_ID name Issue Number 1.6 of the Arm SMC Calling Convention introduces an optional SOC_ID name string. If implemented, point the 'machine' field of the SoC Device Attributes at this string so that it will appear under /sys/bus/soc/devices/soc0/machine. On Arm SMC compliant SoCs, this will allow things like 'lscpu' to eventually get a SoC provider model name from there rather than each tool/utility needing to get a possibly inconsistent, obsolete, or incorrect model/machine name from its own hardcoded model/machine name table. Signed-off-by: Paul Benoit Cc: Mark Rutland Cc: Lorenzo Pieralisi Cc: Sudeep Holla Cc: linux-arm-kernel@lists.infradead.org Acked-by: Mark Rutland Message-Id: <20250219005932.3466-1-paul@os.amperecomputing.com> (sudeep.holla: Dropped regsize variable and used 8 instead as Mark suggested) Signed-off-by: Sudeep Holla --- include/linux/arm-smccc.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 67f6fdf2e7cd..eb7eab04755a 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -639,5 +639,45 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, method; \ }) +#ifdef CONFIG_ARM64 + +#define __fail_smccc_1_2(___res) \ + do { \ + if (___res) \ + ___res->a0 = SMCCC_RET_NOT_SUPPORTED; \ + } while (0) + +/* + * arm_smccc_1_2_invoke() - make an SMCCC v1.2 compliant call + * + * @args: SMC args are in the a0..a17 fields of the arm_smcc_1_2_regs structure + * @res: result values from registers 0 to 17 + * + * This macro will make either an HVC call or an SMC call depending on the + * current SMCCC conduit. If no valid conduit is available then -1 + * (SMCCC_RET_NOT_SUPPORTED) is returned in @res.a0 (if supplied). + * + * The return value also provides the conduit that was used. 
+ */ +#define arm_smccc_1_2_invoke(args, res) ({ \ + struct arm_smccc_1_2_regs *__args = args; \ + struct arm_smccc_1_2_regs *__res = res; \ + int method = arm_smccc_1_1_get_conduit(); \ + switch (method) { \ + case SMCCC_CONDUIT_HVC: \ + arm_smccc_1_2_hvc(__args, __res); \ + break; \ + case SMCCC_CONDUIT_SMC: \ + arm_smccc_1_2_smc(__args, __res); \ + break; \ + default: \ + __fail_smccc_1_2(__res); \ + method = SMCCC_CONDUIT_NONE; \ + break; \ + } \ + method; \ + }) +#endif /*CONFIG_ARM64*/ + #endif /*__ASSEMBLY__*/ #endif /*__LINUX_ARM_SMCCC_H*/ -- cgit v1.2.3 From 9f25b1fb1c93942af870428fa394b42fcda7572f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 6 Feb 2025 14:54:11 -0800 Subject: compiler.h: Introduce __must_be_noncstr() In preparation for adding more type checking to the memtostr/strtomem*() helpers, introduce the ability to check for the "nonstring" attribute. This is the reverse of what was added to strscpy*() in commit 559048d156ff ("string: Check for "nonstring" attribute on strscpy() arguments"). Note that __annotated() must be explicitly tested for, as GCC added __builtin_has_attribute() after it added the "nonstring" attribute. Do so here to avoid the !__annotated() test triggering build failures when __builtin_has_attribute() was missing but __nonstring was defined. (I've opted to squash this fix into this patch so we don't end up with a possible bisection target that would leave the kernel unbuildable.) Reported-by: Venkat Rao Bagalkote Reported-by: Stephen Rothwell Reported-by: Christophe Leroy Reported-by: Michael Kelley Closes: https://lore.kernel.org/all/adbe8dd1-a725-4811-ae7e-76fe770cf096@linux.vnet.ibm.com/ Tested-by: Michael Kelley Signed-off-by: Kees Cook --- include/linux/compiler.h | 18 +++++++++++++++++- include/linux/compiler_types.h | 9 ++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 200fd3c5bc70..d5201464c5e6 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -206,9 +206,25 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define __must_be_byte_array(a) __BUILD_BUG_ON_ZERO_MSG(!__is_byte_array(a), \ "must be byte array") +/* + * If the "nonstring" attribute isn't available, we have to return true + * so the __must_*() checks pass when "nonstring" isn't supported. + */ +#if __has_attribute(__nonstring__) && defined(__annotated) +#define __is_cstr(a) (!__annotated(a, nonstring)) +#define __is_noncstr(a) (__annotated(a, nonstring)) +#else +#define __is_cstr(a) (true) +#define __is_noncstr(a) (true) +#endif + /* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. */ #define __must_be_cstr(p) \ - __BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)") + __BUILD_BUG_ON_ZERO_MSG(!__is_cstr(p), \ + "must be C-string (NUL-terminated)") +#define __must_be_noncstr(p) \ + __BUILD_BUG_ON_ZERO_MSG(!__is_noncstr(p), \ + "must be non-C-string (not NUL-terminated)") #endif /* __KERNEL__ */ diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 981cc3d7e3aa..f59393464ea7 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -446,11 +446,14 @@ struct ftrace_likely_data { #define __member_size(p) __builtin_object_size(p, 1) #endif -/* Determine if an attribute has been applied to a variable. */ +/* + * Determine if an attribute has been applied to a variable. 
+ * Using __annotated needs to check for __annotated being available, + * or negative tests may fail when annotation cannot be checked. For + * example, see the definition of __is_cstr(). + */ #if __has_builtin(__builtin_has_attribute) #define __annotated(var, attr) __builtin_has_attribute(var, attr) -#else -#define __annotated(var, attr) (false) #endif /* -- cgit v1.2.3 From 1286f632a50ce03fdf89245183310e2aa9a91522 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 5 Feb 2025 13:41:58 -0800 Subject: string.h: Validate memtostr*()/strtomem*() arguments more carefully Since these functions handle moving between C strings and non-C strings, they should check for the appropriate presence/lack of the nonstring attribute on arguments. Signed-off-by: Kees Cook --- include/linux/string.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index f8e21e80942f..0403a4ca4c11 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -415,8 +415,10 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, */ #define strtomem_pad(dest, src, pad) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ + __must_be_noncstr(dest) + \ ARRAY_SIZE(dest); \ - const size_t _src_len = __builtin_object_size(src, 1); \ + const size_t _src_len = __must_be_cstr(src) + \ + __builtin_object_size(src, 1); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ _dest_len == (size_t)-1); \ @@ -439,8 +441,10 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, */ #define strtomem(dest, src) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ + __must_be_noncstr(dest) + \ ARRAY_SIZE(dest); \ - const size_t _src_len = __builtin_object_size(src, 1); \ + const size_t _src_len = __must_be_cstr(src) + \ + __builtin_object_size(src, 1); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ _dest_len == (size_t)-1); \ @@ -459,8 +463,10 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, */ #define memtostr(dest, src) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ + __must_be_cstr(dest) + \ ARRAY_SIZE(dest); \ - const size_t _src_len = __builtin_object_size(src, 1); \ + const size_t _src_len = __must_be_noncstr(src) + \ + __builtin_object_size(src, 1); \ const size_t _src_chars = strnlen(src, _src_len); \ const size_t _copy_len = min(_dest_len - 1, _src_chars); \ \ @@ -485,8 +491,10 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, */ #define memtostr_pad(dest, src) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ + __must_be_cstr(dest) + \ ARRAY_SIZE(dest); \ - const size_t _src_len = __builtin_object_size(src, 1); \ + const size_t _src_len = __must_be_noncstr(src) + \ + __builtin_object_size(src, 1); \ const size_t _src_chars = strnlen(src, _src_len); \ const size_t _copy_len = min(_dest_len - 1, _src_chars); \ \ -- cgit v1.2.3 From b56e601afb3fdd0b0c4604a2778e642ababc5414 Mon Sep 17 00:00:00 2001 From: R Sundar Date: Tue, 19 Nov 2024 07:47:18 +0530 Subject: lib/string_choices: Rearrange functions in sorted order Rearrange misplaced functions in sorted order. 
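To illustrate the memtostr*()/strtomem*() contract enforced by the string.h change above, here is a minimal sketch; the struct and names are hypothetical. A fixed-size field that is not NUL-terminated carries the "nonstring" attribute, a C string does not, and mixing the two up now fails at compile time:

	struct disk_label {
		char tag[8] __nonstring;	/* fixed-size field, no NUL terminator */
	};

	static void label_example(struct disk_label *lbl)
	{
		char name[16];

		/* C string -> byte array, zero-padding the remainder: */
		strtomem_pad(lbl->tag, "boot", 0);
		/* byte array -> NUL-terminated C string: */
		memtostr(name, lbl->tag);
	}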
Suggested-by: Andy Shevchenko Signed-off-by: R Sundar Reviewed-by: Larysa Zaremba Link: https://lore.kernel.org/r/20241119021719.7659-2-prosunofficial@gmail.com Signed-off-by: Kees Cook --- include/linux/string_choices.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index 120ca0f28e95..f3ba4f52ff26 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -41,23 +41,23 @@ static inline const char *str_high_low(bool v) } #define str_low_high(v) str_high_low(!(v)) -static inline const char *str_read_write(bool v) -{ - return v ? "read" : "write"; -} -#define str_write_read(v) str_read_write(!(v)) - static inline const char *str_on_off(bool v) { return v ? "on" : "off"; } #define str_off_on(v) str_on_off(!(v)) -static inline const char *str_yes_no(bool v) +static inline const char *str_read_write(bool v) { - return v ? "yes" : "no"; + return v ? "read" : "write"; } -#define str_no_yes(v) str_yes_no(!(v)) +#define str_write_read(v) str_read_write(!(v)) + +static inline const char *str_true_false(bool v) +{ + return v ? "true" : "false"; +} +#define str_false_true(v) str_true_false(!(v)) static inline const char *str_up_down(bool v) { @@ -65,11 +65,11 @@ static inline const char *str_up_down(bool v) } #define str_down_up(v) str_up_down(!(v)) -static inline const char *str_true_false(bool v) +static inline const char *str_yes_no(bool v) { - return v ? "true" : "false"; + return v ? "yes" : "no"; } -#define str_false_true(v) str_true_false(!(v)) +#define str_no_yes(v) str_yes_no(!(v)) /** * str_plural - Return the simple pluralization based on English counts -- cgit v1.2.3 From 105ca2a2c2ff2c8df0e334d6913d62eec1973dd3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Feb 2025 07:44:33 -0800 Subject: block: split struct bio_integrity_payload Many of the fields in struct bio_integrity_payload are only needed for the default integrity buffer in the block layer, and the variable sized array at the end of the structure makes it very hard to embed into caller allocated structures. Reduce struct bio_integrity_payload to the minimal structure needed in common code and create two separate containing structures for the automatically generated payload and the caller allocated payload. The latter is a simple wrapper for struct bio_integrity_payload and the bvecs, while the former contains the additional fields moved out of struct bio_integrity_payload. Always use a dedicated mempool for automatic integrity metadata instead of depending on bio_set that is submitter controlled and thus often doesn't have the mempool initialized and stop using mempools for the submitter buffers as they aren't in the NOIO I/O submission path where we need to guarantee forward progress. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Tested-by: Anuj Gupta Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Link: https://lore.kernel.org/r/20250225154449.422989-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 25 ++----------------------- include/linux/bio.h | 4 ---- 2 files changed, 2 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 802f52e38efd..0a25716820fe 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -16,8 +16,6 @@ enum bip_flags { }; struct bio_integrity_payload { - struct bio *bip_bio; /* parent bio */ - struct bvec_iter bip_iter; unsigned short bip_vcnt; /* # of integrity bio_vecs */ @@ -25,12 +23,7 @@ struct bio_integrity_payload { unsigned short bip_flags; /* control flags */ u16 app_tag; /* application tag value */ - struct bvec_iter bio_iter; /* for rewinding parent bio */ - - struct work_struct bip_work; /* I/O completion */ - struct bio_vec *bip_vec; - struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; #define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \ @@ -74,6 +67,8 @@ static inline void bip_set_seed(struct bio_integrity_payload *bip, bip->bip_iter.bi_sector = seed; } +void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip, + struct bio_vec *bvecs, unsigned int nr_vecs); struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, @@ -85,9 +80,6 @@ bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); void bio_integrity_trim(struct bio *bio); int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask); -int bioset_integrity_create(struct bio_set *bs, int pool_size); -void bioset_integrity_free(struct bio_set *bs); -void bio_integrity_init(void); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) { return NULL; } -static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - return 0; -} - -static inline void bioset_integrity_free(struct bio_set *bs) -{ -} - static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; } @@ -139,10 +122,6 @@ static inline void bio_integrity_trim(struct bio *bio) { } -static inline void bio_integrity_init(void) -{ -} - static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { return false; diff --git a/include/linux/bio.h b/include/linux/bio.h index 4b79bf50f4f0..cafc7c215de8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -625,10 +625,6 @@ struct bio_set { mempool_t bio_pool; mempool_t bvec_pool; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - mempool_t bio_integrity_pool; - mempool_t bvec_integrity_pool; -#endif unsigned int back_pad; /* -- cgit v1.2.3 From 5d2b978ff9b13b1286c6e1c753b331f4bd98d7ff Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Fri, 21 Feb 2025 18:45:44 +0530 Subject: perf/dwc_pcie: Move common DWC struct definitions to 'pcie-dwc.h' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the common DWC struct definitions, which are shared across all the DesignWare PCIe IPs, to a new header file called 'pcie-dwc.h', so that other users, e.g. debugfs, perf and sysfs, can make use of them.
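As a sketch of how a consumer might use the shared table, the loop below walks dwc_pcie_rasdes_vsec_ids with the existing PCI core helper pci_find_vsec_capability(); the wrapper name is hypothetical, and real users would additionally validate the revision (@vsec_rev) against the VSEC header:

	static u16 find_rasdes_vsec(struct pci_dev *pdev)
	{
		const struct dwc_pcie_vsec_id *id;
		u16 vsec;

		for (id = dwc_pcie_rasdes_vsec_ids; id->vendor_id; id++) {
			vsec = pci_find_vsec_capability(pdev, id->vendor_id,
							id->vsec_id);
			if (vsec)
				return vsec;	/* config space offset of the VSEC */
		}
		return 0;
	}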
Signed-off-by: Manivannan Sadhasivam Signed-off-by: Shradha Todi Reviewed-by: Shuai Xue Reviewed-by: Fan Ni Tested-by: Hrishikesh Deleep Link: https://lore.kernel.org/r/20250221131548.59616-2-shradha.t@samsung.com [kwilczynski: commit log, tidy up the new header file] Signed-off-by: Krzysztof Wilczyński --- include/linux/pcie-dwc.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 include/linux/pcie-dwc.h (limited to 'include/linux') diff --git a/include/linux/pcie-dwc.h b/include/linux/pcie-dwc.h new file mode 100644 index 000000000000..6e4e1307ad6e --- /dev/null +++ b/include/linux/pcie-dwc.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021-2023 Alibaba Inc. + * Copyright (C) 2025 Linaro Ltd. + * + * Author: Manivannan Sadhasivam + */ + +#ifndef LINUX_PCIE_DWC_H +#define LINUX_PCIE_DWC_H + +#include + +struct dwc_pcie_vsec_id { + u16 vendor_id; + u16 vsec_id; + u8 vsec_rev; +}; + +/* + * VSEC IDs are allocated by the vendor, so a given ID may mean different + * things to different vendors. See PCIe r6.0, sec 7.9.5.2. + */ +static const struct dwc_pcie_vsec_id dwc_pcie_rasdes_vsec_ids[] = { + { .vendor_id = PCI_VENDOR_ID_ALIBABA, + .vsec_id = 0x02, .vsec_rev = 0x4 }, + { .vendor_id = PCI_VENDOR_ID_AMPERE, + .vsec_id = 0x02, .vsec_rev = 0x4 }, + { .vendor_id = PCI_VENDOR_ID_QCOM, + .vsec_id = 0x02, .vsec_rev = 0x4 }, + {} +}; + +#endif /* LINUX_PCIE_DWC_H */ -- cgit v1.2.3 From a6687c8ff613fc13a71ce1390593ba8d27c52db9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 3 Mar 2025 17:28:05 +0000 Subject: slab: Mark large folios for debugging purposes If a user calls p = kmalloc(1024); kfree(p); kfree(p); and 'p' was the only object in the slab, we may free the slab after the first call to kfree(). If we do, we clear PGTY_slab and the second call to kfree() will call free_large_kmalloc(). That will leave a trace in the logs ("object pointer: 0x%p"), but otherwise proceed to free the memory, which is likely to corrupt the page allocator's metadata. Allocate a new page type for large kmalloc and mark the memory with it while it's allocated. That lets us detect this double-free and return without harming any data structures. Reported-by: Hannes Reinecke Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/page-flags.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 36d283552f80..df9234e5f478 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -925,14 +925,15 @@ FOLIO_FLAG_FALSE(has_hwpoisoned) enum pagetype { /* 0x00-0x7f are positive numbers, ie mapcount */ /* Reserve 0x80-0xef for mapcount overflow. */ - PGTY_buddy = 0xf0, - PGTY_offline = 0xf1, - PGTY_table = 0xf2, - PGTY_guard = 0xf3, - PGTY_hugetlb = 0xf4, - PGTY_slab = 0xf5, - PGTY_zsmalloc = 0xf6, - PGTY_unaccepted = 0xf7, + PGTY_buddy = 0xf0, + PGTY_offline = 0xf1, + PGTY_table = 0xf2, + PGTY_guard = 0xf3, + PGTY_hugetlb = 0xf4, + PGTY_slab = 0xf5, + PGTY_zsmalloc = 0xf6, + PGTY_unaccepted = 0xf7, + PGTY_large_kmalloc = 0xf8, PGTY_mapcount_underflow = 0xff }; @@ -1075,6 +1076,7 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) * Serialized with zone lock. 
*/ PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) +FOLIO_TYPE_OPS(large_kmalloc, large_kmalloc) /** * PageHuge - Determine if the page belongs to hugetlbfs -- cgit v1.2.3 From f9fde814de3755d5d3818fe51244c45699f7252e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 21 Feb 2025 14:13:07 +0100 Subject: fs: support getname_maybe_null() in move_mount() Allow move_mount() to work with NULL path arguments. Link: https://lore.kernel.org/r/20250221-brauner-open_tree-v1-8-dbcfcb98c676@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e71d58c7f59c..7e9df867538d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2855,6 +2855,7 @@ static inline struct filename *getname_maybe_null(const char __user *name, int f return __getname_maybe_null(name); } extern void putname(struct filename *name); +DEFINE_FREE(putname, struct filename *, if (!IS_ERR_OR_NULL(_T)) putname(_T)) extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); -- cgit v1.2.3 From c70ca298036c58a88686ff388d3d367e9d21acf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:13 +0100 Subject: perf/core: Simplify the perf_event_alloc() error path The error cleanup sequence in perf_event_alloc() is a subset of the existing _free_event() function (it must of course be). Split this out into __free_event() and simplify the error path. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241104135517.967889521@infradead.org --- include/linux/perf_event.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c4525bae2fe9..8c0117bcbdb9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -673,13 +673,15 @@ struct swevent_hlist { struct rcu_head rcu_head; }; -#define PERF_ATTACH_CONTEXT 0x01 -#define PERF_ATTACH_GROUP 0x02 -#define PERF_ATTACH_TASK 0x04 -#define PERF_ATTACH_TASK_DATA 0x08 -#define PERF_ATTACH_ITRACE 0x10 -#define PERF_ATTACH_SCHED_CB 0x20 -#define PERF_ATTACH_CHILD 0x40 +#define PERF_ATTACH_CONTEXT 0x0001 +#define PERF_ATTACH_GROUP 0x0002 +#define PERF_ATTACH_TASK 0x0004 +#define PERF_ATTACH_TASK_DATA 0x0008 +#define PERF_ATTACH_ITRACE 0x0010 +#define PERF_ATTACH_SCHED_CB 0x0020 +#define PERF_ATTACH_CHILD 0x0040 +#define PERF_ATTACH_EXCLUSIVE 0x0080 +#define PERF_ATTACH_CALLCHAIN 0x0100 struct bpf_prog; struct perf_cgroup; -- cgit v1.2.3 From 6c8b0b835f003647e593c08331a4dd2150d5eb0e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:15 +0100 Subject: perf/core: Simplify perf_pmu_register() Using the previously introduced perf_pmu_free() and a new IDR helper, simplify the perf_pmu_register error paths. 
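The "new IDR helper" is the scope-based idr_alloc guard added to include/linux/idr.h in the hunk below. A minimal usage sketch follows, with hypothetical names; the allocated ID is removed automatically on any early return unless the caller disarms the cleanup with take_idr_id():

	static int register_thing(struct idr *idr, struct thing *t)
	{
		CLASS(idr_alloc, id)(idr, t, 0, 0, GFP_KERNEL);

		if (id.id < 0)
			return id.id;		/* idr_alloc() failed */

		if (thing_setup(t))		/* hypothetical failure path */
			return -EINVAL;		/* scope exit runs idr_remove() */

		t->id = id.id;
		take_idr_id(id);		/* success: keep the ID */
		return 0;
	}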
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241104135518.198937277@infradead.org --- include/linux/idr.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/idr.h b/include/linux/idr.h index da5f5fa4a3a6..cd729be369b3 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -15,6 +15,7 @@ #include #include #include +#include struct idr { struct radix_tree_root idr_rt; @@ -124,6 +125,22 @@ void *idr_get_next_ul(struct idr *, unsigned long *nextid); void *idr_replace(struct idr *, void *, unsigned long id); void idr_destroy(struct idr *); +struct __class_idr { + struct idr *idr; + int id; +}; + +#define idr_null ((struct __class_idr){ NULL, -1 }) +#define take_idr_id(id) __get_and_null(id, idr_null) + +DEFINE_CLASS(idr_alloc, struct __class_idr, + if (_T.id >= 0) idr_remove(_T.idr, _T.id), + ((struct __class_idr){ + .idr = idr, + .id = idr_alloc(idr, ptr, start, end, gfp), + }), + struct idr *idr, void *ptr, int start, int end, gfp_t gfp); + /** * idr_init_base() - Initialise an IDR. * @idr: IDR handle. -- cgit v1.2.3 From 4baeb0687abf5eca3f7ab8b147c27cce82ec49ea Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:18 +0100 Subject: perf/core: Merge struct pmu::pmu_disable_count into struct perf_cpu_pmu_context::pmu_disable_count Because it makes no sense to have two per-cpu allocations per pmu. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241104135518.518730578@infradead.org --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8c0117bcbdb9..5f293e679ab6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -343,7 +343,6 @@ struct pmu { */ unsigned int scope; - int __percpu *pmu_disable_count; struct perf_cpu_pmu_context __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; @@ -1031,6 +1030,7 @@ struct perf_cpu_pmu_context { int active_oncpu; int exclusive; + int pmu_disable_count; raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; -- cgit v1.2.3 From 4eabf533fb1886089ef57e0c8ec52048b1741e39 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2024 14:39:20 +0100 Subject: perf/core: Detach 'struct perf_cpu_pmu_context' and 'struct pmu' lifetimes In preparation for being able to unregister a PMU with existing events, it becomes important to detach struct perf_cpu_pmu_context lifetimes from that of struct pmu. Notably struct perf_cpu_pmu_context embeds a struct perf_event_pmu_context that can stay referenced until the last event goes.
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241104135518.760214287@infradead.org --- include/linux/perf_event.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5f293e679ab6..76f4265efee9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -343,7 +343,7 @@ struct pmu { */ unsigned int scope; - struct perf_cpu_pmu_context __percpu *cpu_pmu_context; + struct perf_cpu_pmu_context __percpu **cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; @@ -922,7 +922,7 @@ struct perf_event_pmu_context { struct list_head pinned_active; struct list_head flexible_active; - /* Used to avoid freeing per-cpu perf_event_pmu_context */ + /* Used to identify the per-cpu perf_event_pmu_context */ unsigned int embedded : 1; unsigned int nr_events; -- cgit v1.2.3 From 46af8e2406c27cc2f21094983697ff872102065f Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 4 Mar 2025 00:04:08 +0100 Subject: pipe: cache 2 pages instead of 1 User data is kept in a circular buffer backed by pages allocated as needed. Having space for only one spare page still makes it likely that the pipe has to resort to allocation and freeing. In my testing this decreases page allocs by 60% during a kernel build. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250303230409.452687-3-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/pipe_fs_i.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 8ff23bf5a819..eb7994a1ff93 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -72,7 +72,7 @@ struct pipe_inode_info { #ifdef CONFIG_WATCH_QUEUE bool note_loss; #endif - struct page *tmp_page; + struct page *tmp_page[2]; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; -- cgit v1.2.3 From 84654c7f47307692d47ea914d01287c8c54b3532 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 4 Mar 2025 00:04:09 +0100 Subject: wait: avoid spurious calls to prepare_to_wait_event() in ___wait_event() In the vast majority of cases the condition determining whether the thread can proceed is true after the first wake up. However, even in that case the thread ends up calling into prepare_to_wait_event() again, suffering a spurious irq + lock trip. Then it calls into finish_wait() to unlink itself. Note that in case of a pending signal the work done by prepare_to_wait_event() gets ignored even without the change. Pre-check the condition after waking up instead. Stats gathered during a kernel build: bpftrace -e 'kprobe:prepare_to_wait_event,kprobe:finish_wait \ { @[probe] = count(); }' @[kprobe:finish_wait]: 392483 @[kprobe:prepare_to_wait_event]: 778690 That is, calls to prepare_to_wait_event() are almost double the calls to finish_wait(). This evens out with the patch.
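For illustration, the canonical waiter/waker pairing this affects (a minimal sketch): with the re-check added in the hunk below, a waiter whose condition became true while it slept now breaks out of the loop right after waking, instead of paying for one more prepare_to_wait_event() round trip.

	static DECLARE_WAIT_QUEUE_HEAD(wq);
	static bool done;

	/* waiter */
	wait_event(wq, READ_ONCE(done));

	/* waker */
	WRITE_ONCE(done, true);
	wake_up(&wq);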
Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250303230409.452687-4-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/wait.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 6d90ad974408..3503fe822e38 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -316,6 +316,9 @@ extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags); } \ \ cmd; \ + \ + if (condition) \ + break; \ } \ finish_wait(&wq_head, &__wq_entry); \ __out: __ret; \ -- cgit v1.2.3 From 4f2a0b765c9731d2fa94e209ee9ae0e96b280f17 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 4 Mar 2025 09:51:41 +0100 Subject: <linux/sizes.h>: Cover all possible x86 CPU cache sizes Add size macros for 24/192/384 Kilobytes and 3/6/12/18/24 Megabytes. With that, the x86 subsystem can avoid locally defining its own macros for CPU cache sizes. Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20250304085152.51092-31-darwi@linutronix.de --- include/linux/sizes.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sizes.h b/include/linux/sizes.h index c3a00b967d18..49039494076f 100644 --- a/include/linux/sizes.h +++ b/include/linux/sizes.h @@ -23,17 +23,25 @@ #define SZ_4K 0x00001000 #define SZ_8K 0x00002000 #define SZ_16K 0x00004000 +#define SZ_24K 0x00006000 #define SZ_32K 0x00008000 #define SZ_64K 0x00010000 #define SZ_128K 0x00020000 +#define SZ_192K 0x00030000 #define SZ_256K 0x00040000 +#define SZ_384K 0x00060000 #define SZ_512K 0x00080000 #define SZ_1M 0x00100000 #define SZ_2M 0x00200000 +#define SZ_3M 0x00300000 #define SZ_4M 0x00400000 +#define SZ_6M 0x00600000 #define SZ_8M 0x00800000 +#define SZ_12M 0x00c00000 #define SZ_16M 0x01000000 +#define SZ_18M 0x01200000 +#define SZ_24M 0x01800000 #define SZ_32M 0x02000000 #define SZ_64M 0x04000000 #define SZ_128M 0x08000000 -- cgit v1.2.3 From bd3ce71078bde4ecbfc60d49c96d1c55de0635cc Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 25 Feb 2025 20:40:34 +0100 Subject: gpiolib: of: Handle threecell GPIO chips When describing GPIO controllers in the device tree, the ambition of device tree to describe the hardware may require a three-cell scheme: gpios = <&gpio instance offset flags>; This implements support for this scheme in the gpiolib OF core. Drivers that want to handle multiple gpiochip instances from one OF node need to implement a callback similar to this to determine if a certain gpio chip is a pointer to the right instance (pseudo-code): struct my_gpio { struct gpio_chip gcs[MAX_CHIPS]; }; static bool my_of_node_instance_match(struct gpio_chip *gc, unsigned int instance) { struct my_gpio *mg = gpiochip_get_data(gc); if (instance >= MAX_CHIPS) return false; return (gc == &mg->gcs[instance]); } probe() { struct my_gpio *mg; struct gpio_chip *gc; int i, ret; for (i = 0; i < MAX_CHIPS; i++) { gc = &mg->gcs[i]; /* This tells gpiolib we have several instances per node */ gc->of_gpio_n_cells = 3; gc->of_node_instance_match = my_of_node_instance_match; gc->base = -1; ... ret = devm_gpiochip_add_data(dev, gc, mg); if (ret) return ret; } } Rename the "simple" of_xlate function to "twocell" which is closer to what it actually does.
In the device tree bindings, the provider node needs to specify #gpio-cells = <3>; where the first cell is the instance number: gpios = <&gpio instance offset flags>; Conversely, ranges need to have four cells: gpio-ranges = <&pinctrl instance gpio_offset pin_offset count>; Reviewed-by: Alex Elder Tested-by: Yixun Lan Signed-off-by: Linus Walleij Reviewed-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250225-gpio-ranges-fourcell-v3-2-860382ba4713@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 783897d94be8..83e0a7e86962 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -531,10 +531,32 @@ struct gpio_chip { /** * @of_gpio_n_cells: * - * Number of cells used to form the GPIO specifier. + * Number of cells used to form the GPIO specifier. The standard is 2 + * cells: + * + * gpios = <&gpio offset flags>; + * + * some complex GPIO controllers instantiate more than one chip per + * device tree node and have 3 cells: + * + * gpios = <&gpio instance offset flags>; + * + * Legacy GPIO controllers may even have 1 cell: + * + * gpios = <&gpio offset>; */ unsigned int of_gpio_n_cells; + /** + * of_node_instance_match: + * + * Determine if a chip is the right instance. Must be implemented by + * any driver using more than one gpio_chip per device tree node. + * Returns true if gc is the instance indicated by i (which is the + * first cell in the phandles for GPIO lines and gpio-ranges). + */ + bool (*of_node_instance_match)(struct gpio_chip *gc, unsigned int i); + /** * @of_xlate: * -- cgit v1.2.3 From f2cb43c98010181b532ff36643731dc2442b9b7d Mon Sep 17 00:00:00 2001 From: Cheng Ming Lin Date: Mon, 24 Feb 2025 15:03:48 +0800 Subject: mtd: spinand: Add read retry support When the host ECC fails to correct a data error on the NAND device, there is a special read-for-data-recovery method which can be set up by the host for the next read. There are several retry levels that can be attempted until the lost data is recovered or definitely assumed lost. Signed-off-by: Cheng Ming Lin Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 73424405232a..5837a09ab9d8 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -452,6 +452,8 @@ struct spinand_user_otp { * @set_cont_read: enable/disable continuous cached reads * @fact_otp: SPI NAND factory OTP info. * @user_otp: SPI NAND user OTP info. + * @read_retries: the number of read retry modes supported + * @set_read_retry: enable/disable read retry for data recovery * * Each SPI NAND manufacturer driver should have a spinand_info table * describing all the chips supported by the driver. @@ -474,6 +476,9 @@ struct spinand_info { bool enable); struct spinand_fact_otp fact_otp; struct spinand_user_otp user_otp; + unsigned int read_retries; + int (*set_read_retry)(struct spinand_device *spinand, + unsigned int read_retry); }; #define SPINAND_ID(__method, ...)
\ @@ -520,6 +525,10 @@ struct spinand_info { .ops = __ops, \ } +#define SPINAND_READ_RETRY(__read_retries, __set_read_retry) \ + .read_retries = __read_retries, \ + .set_read_retry = __set_read_retry, + #define SPINAND_INFO(__model, __id, __memorg, __eccreq, __op_variants, \ __flags, ...) \ { \ @@ -572,6 +581,8 @@ struct spinand_dirmap { * @priv: manufacturer private data * @fact_otp: SPI NAND factory OTP info. * @user_otp: SPI NAND user OTP info. + * @read_retries: the number of read retry modes supported + * @set_read_retry: Enable/disable the read retry feature */ struct spinand_device { struct nand_device base; @@ -607,6 +618,10 @@ struct spinand_device { const struct spinand_fact_otp *fact_otp; const struct spinand_user_otp *user_otp; + + unsigned int read_retries; + int (*set_read_retry)(struct spinand_device *spinand, + unsigned int retry_mode); }; /** -- cgit v1.2.3 From 0c493da86374dffff7505e67289ad75b21f5b301 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 28 Feb 2025 11:20:56 +0100 Subject: net: rename netns_local to netns_immutable The name 'netns_local' is confusing. A following commit will export it via netlink, so let's use a more explicit name. Reported-by: Eric Dumazet Suggested-by: Kuniyuki Iwashima Signed-off-by: Nicolas Dichtel Reviewed-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 26a0c4e4d963..b8728d67ea91 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2021,7 +2021,7 @@ enum netdev_reg_state { * regardless of source, even if those aren't * HWTSTAMP_SOURCE_NETDEV * @change_proto_down: device supports setting carrier via IFLA_PROTO_DOWN - * @netns_local: interface can't change network namespaces + * @netns_immutable: interface can't change network namespaces * @fcoe_mtu: device supports maximum FCoE MTU, 2158 bytes * * @net_notifier_list: List of per-net netdev notifier block @@ -2429,7 +2429,7 @@ struct net_device { /* priv_flags_slow, ungrouped to save space */ unsigned long see_all_hwtstamp_requests:1; unsigned long change_proto_down:1; - unsigned long netns_local:1; + unsigned long netns_immutable:1; unsigned long fcoe_mtu:1; struct list_head net_notifier_list; -- cgit v1.2.3 From 12b6f7069ba5aa160f3332916408b34ae8e0b0f6 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 28 Feb 2025 11:20:58 +0100 Subject: net: plumb extack in __dev_change_net_namespace() It could be hard to understand why the netlink command fails. For example, if dev->netns_immutable is set, the error is "Invalid argument". 
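A sketch of the kind of check this enables inside __dev_change_net_namespace(); the exact message text here is illustrative:

	if (dev->netns_immutable) {
		NL_SET_ERR_MSG(extack, "The interface netns is immutable");
		return -EINVAL;
	}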
Signed-off-by: Nicolas Dichtel Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b8728d67ea91..7ab86ec228b7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4191,12 +4191,13 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int __dev_change_net_namespace(struct net_device *dev, struct net *net, - const char *pat, int new_ifindex); + const char *pat, int new_ifindex, + struct netlink_ext_ack *extack); static inline int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { - return __dev_change_net_namespace(dev, net, pat, 0); + return __dev_change_net_namespace(dev, net, pat, 0, NULL); } int __dev_set_mtu(struct net_device *, int); int dev_set_mtu(struct net_device *, int); -- cgit v1.2.3 From 71cbbb7149e3de8c39dfe8a97eaa7f1cbcbff52f Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 4 Mar 2025 14:18:14 +0100 Subject: irqchip/davinci-cp-intc: Remove public header There are no more users of irq-davinci-cp-intc.h (da830.c doesn't use any of its symbols). Remove the header and make the driver stop using the config structure. [ tglx: Mop up coding style ] Signed-off-by: Bartosz Golaszewski Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250304131815.86549-1-brgl@bgdev.pl --- include/linux/irqchip/irq-davinci-cp-intc.h | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 include/linux/irqchip/irq-davinci-cp-intc.h (limited to 'include/linux') diff --git a/include/linux/irqchip/irq-davinci-cp-intc.h b/include/linux/irqchip/irq-davinci-cp-intc.h deleted file mode 100644 index 8d71ed5b5a61..000000000000 --- a/include/linux/irqchip/irq-davinci-cp-intc.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2019 Texas Instruments - */ - -#ifndef _LINUX_IRQ_DAVINCI_CP_INTC_ -#define _LINUX_IRQ_DAVINCI_CP_INTC_ - -#include - -/** - * struct davinci_cp_intc_config - configuration data for davinci-cp-intc - * driver. - * - * @reg: register range to map - * @num_irqs: number of HW interrupts supported by the controller - */ -struct davinci_cp_intc_config { - struct resource reg; - unsigned int num_irqs; -}; - -int davinci_cp_intc_init(const struct davinci_cp_intc_config *config); - -#endif /* _LINUX_IRQ_DAVINCI_CP_INTC_ */ -- cgit v1.2.3 From dc872c5f527a07124b2ad92d8c70d11829f17ac9 Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Mon, 3 Mar 2025 11:29:22 +0800 Subject: Coresight: Add support for new APB clock name Add support for a new APB clock name. If the function fails to obtain the clock with the name "apb_pclk", it will attempt to acquire the clock with the name "apb".
Reviewed-by: James Clark Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250303032931.2500935-2-quic_jiegan@quicinc.com --- include/linux/coresight.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index c7614ccb3232..2e493d5288d8 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -471,8 +471,11 @@ static inline struct clk *coresight_get_enable_apb_pclk(struct device *dev) int ret; pclk = clk_get(dev, "apb_pclk"); - if (IS_ERR(pclk)) - return NULL; + if (IS_ERR(pclk)) { + pclk = clk_get(dev, "apb"); + if (IS_ERR(pclk)) + return NULL; + } ret = clk_prepare_enable(pclk); if (ret) { -- cgit v1.2.3 From c367a89dec267c65b556ace5d6aafd07fb1d66b1 Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Mon, 3 Mar 2025 11:29:23 +0800 Subject: Coresight: Add trace_id function to retrieving the trace ID Add a 'trace_id' function pointer to coresight_ops. It is responsible for retrieving the device's trace ID. Co-developed-by: James Clark Signed-off-by: James Clark Reviewed-by: James Clark Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250303032931.2500935-3-quic_jiegan@quicinc.com --- include/linux/coresight.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 2e493d5288d8..1396bda77f3f 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -335,6 +335,7 @@ enum cs_mode { CS_MODE_PERF, }; +#define coresight_ops(csdev) csdev->ops #define source_ops(csdev) csdev->ops->source_ops #define sink_ops(csdev) csdev->ops->sink_ops #define link_ops(csdev) csdev->ops->link_ops @@ -421,6 +422,8 @@ struct coresight_ops_panic { }; struct coresight_ops { + int (*trace_id)(struct coresight_device *csdev, enum cs_mode mode, + struct coresight_device *sink); const struct coresight_ops_sink *sink_ops; const struct coresight_ops_link *link_ops; const struct coresight_ops_source *source_ops; @@ -713,4 +716,6 @@ int coresight_init_driver(const char *drv, struct amba_driver *amba_drv, void coresight_remove_driver(struct amba_driver *amba_drv, struct platform_driver *pdev_drv); +int coresight_etm_get_trace_id(struct coresight_device *csdev, enum cs_mode mode, + struct coresight_device *sink); #endif /* _LINUX_COREISGHT_H */ -- cgit v1.2.3 From 533c20b062d7c25cbcbadb31e3ecb95a08ddb877 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 27 Feb 2025 13:58:05 -0500 Subject: ftrace: Add print_function_args() Add a function to decode argument types with the help of BTF. Will be used to display arguments in the function and function graph tracer. It can only handle simple arguments and up to FTRACE_REGS_MAX_ARGS number of arguments. When it hits the maximum, it will print ", ...": page_to_skb(vi=0xffff8d53842dc980, rq=0xffff8d53843a0800, page=0xfffffc2e04337c00, offset=6160, len=64, truesize=1536, ...)
And if it hits an argument that is not recognized, it will print the raw value and the type of argument it is: make_vfsuid(idmap=0xffffffff87f99db8, fs_userns=0xffffffff87e543c0, kuid=0x0 (STRUCT)) __pti_set_user_pgtbl(pgdp=0xffff8d5384ab47f8, pgd=0x110e74067 (STRUCT)) Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Guo Ren Cc: Donglin Peng Cc: Zheng Yejian Link: https://lore.kernel.org/20250227185822.639418500@goodmis.org Reviewed-by: Masami Hiramatsu (Google) Co-developed-by: Steven Rostedt (Google) Signed-off-by: Sven Schnelle Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace_regs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace_regs.h b/include/linux/ftrace_regs.h index bbc1873ca6b8..15627ceea9bc 100644 --- a/include/linux/ftrace_regs.h +++ b/include/linux/ftrace_regs.h @@ -35,4 +35,9 @@ struct ftrace_regs; #endif /* HAVE_ARCH_FTRACE_REGS */ +/* This can be overridden by the architectures */ +#ifndef FTRACE_REGS_MAX_ARGS +# define FTRACE_REGS_MAX_ARGS 6 +#endif + #endif /* _LINUX_FTRACE_REGS_H */ -- cgit v1.2.3 From 36e1d6344aca13e1f20af099561db75f28649151 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:36 +0000 Subject: mm: Remove wait_for_stable_page() The last caller has been converted to call folio_wait_stable(), so we can remove this wrapper. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/pagemap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 47bfc6b1b632..a19d8e334194 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1256,7 +1256,6 @@ void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); -void wait_for_stable_page(struct page *page); void folio_wait_stable(struct folio *folio); void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn); void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb); -- cgit v1.2.3 From e33ce6bd4ea2703444509cafb3646a547573fbc3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:51:45 +0000 Subject: mm: Remove grab_cache_page_write_begin() All callers have now been converted to use folios, so remove this compatibility wrapper. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/pagemap.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a19d8e334194..45817e2106ee 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -990,9 +990,6 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); -struct page *grab_cache_page_write_begin(struct address_space *mapping, - pgoff_t index); - /* * Returns locked page at given index in given cache, creating it if needed. */ -- cgit v1.2.3 From d96e2802a802bea49973f82d921b45de910ecaca Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Feb 2025 05:52:01 +0000 Subject: mm: Remove wait_on_page_locked() This compatibility wrapper has no callers left, so remove it. 
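For reference, the conversion at call sites is mechanical; a sketch of the pattern (hypothetical caller, mirroring the wrapper body removed below):

	/* Before: wait_on_page_locked(page); */
	folio_wait_locked(page_folio(page));

	/* Callers that already operate on a folio simply use: */
	folio_wait_locked(folio);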
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/pagemap.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 45817e2106ee..b8c6fa320ee3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1242,11 +1242,6 @@ static inline int folio_wait_locked_killable(struct folio *folio) return folio_wait_bit_killable(folio, PG_locked); } -static inline void wait_on_page_locked(struct page *page) -{ - folio_wait_locked(page_folio(page)); -} - void folio_end_read(struct folio *folio, bool success); void wait_on_page_writeback(struct page *page); void folio_wait_writeback(struct folio *folio); -- cgit v1.2.3 From 3c03c49b2fa59c4d456e2d673fe5847fa6cbcdf2 Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Mon, 3 Mar 2025 11:29:25 +0800 Subject: Coresight: Introduce a new struct coresight_path Introduce a new structure, 'struct coresight_path', to store the data that is utilized by the devices in the path. The coresight_path will be built/released by the coresight_build_path/coresight_release_path functions. Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250303032931.2500935-5-quic_jiegan@quicinc.com --- include/linux/coresight.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 1396bda77f3f..509e1f256412 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -329,6 +329,16 @@ static struct coresight_dev_list (var) = { \ #define to_coresight_device(d) container_of(d, struct coresight_device, dev) +/** + * struct coresight_path - data needed by enable/disable path + * @path_list: path from source to sink. + * @trace_id: trace_id of the whole path. + */ +struct coresight_path { + struct list_head path_list; + u8 trace_id; +}; + enum cs_mode { CS_MODE_DISABLED, CS_MODE_SYSFS, -- cgit v1.2.3 From 7b365f056d8e02fc70c823bdf736e41a7236a54b Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Mon, 3 Mar 2025 11:29:27 +0800 Subject: Coresight: Change to read the trace ID from coresight_path The source device can directly read the trace ID from the coresight_path, which results in etm_read_alloc_trace_id and etm4_read_alloc_trace_id being deleted. Co-developed-by: James Clark Signed-off-by: James Clark Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250303032931.2500935-7-quic_jiegan@quicinc.com --- include/linux/coresight.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 509e1f256412..bc0c853ffa6d 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -401,7 +401,7 @@ struct coresight_ops_link { struct coresight_ops_source { int (*cpu_id)(struct coresight_device *csdev); int (*enable)(struct coresight_device *csdev, struct perf_event *event, - enum cs_mode mode, struct coresight_trace_id_map *id_map); + enum cs_mode mode, struct coresight_path *path); void (*disable)(struct coresight_device *csdev, struct perf_event *event); }; -- cgit v1.2.3 From f78d206f3d73446e4c3568946a26f903da2149b4 Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Mon, 3 Mar 2025 11:29:30 +0800 Subject: Coresight: Add Coresight TMC Control Unit driver The Coresight TMC Control Unit hosts miscellaneous configuration registers which control various features related to the TMC ETR sink.
Based on the trace ID, which is programmed in the related CTCU ATID register of a specific ETR, trace data with that trace ID gets into the ETR buffer, while other trace data gets dropped. Enabling a source device sets one bit of the ATID register based on the source device's trace ID. Disabling a source device resets the bit according to the source device's trace ID. Reviewed-by: James Clark Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250303032931.2500935-10-quic_jiegan@quicinc.com --- include/linux/coresight.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index bc0c853ffa6d..89b781e70fe7 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -71,7 +71,8 @@ enum coresight_dev_subtype_source { enum coresight_dev_subtype_helper { CORESIGHT_DEV_SUBTYPE_HELPER_CATU, - CORESIGHT_DEV_SUBTYPE_HELPER_ECT_CTI + CORESIGHT_DEV_SUBTYPE_HELPER_ECT_CTI, + CORESIGHT_DEV_SUBTYPE_HELPER_CTCU, }; /** -- cgit v1.2.3 From 3d252160b818045f3a152b13756f6f37ca34639d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 4 Mar 2025 13:51:38 +0000 Subject: fs/pipe: Read pipe->{head,tail} atomically outside pipe->mutex pipe_readable(), pipe_writable(), and pipe_poll() can read "pipe->head" and "pipe->tail" outside of the "pipe->mutex" critical section. When the head and the tail are read individually in that order, there is a window for interruption between the two reads in which both the head and the tail can be updated by concurrent readers and writers. One of the problematic scenarios observed with hackbench running multiple groups on a large server on a particular pipe inode is as follows: pipe->head = 36 pipe->tail = 36 hackbench-118762 [057] ..... 1029.550548: pipe_write: *wakes up: pipe not full* hackbench-118762 [057] ..... 1029.550548: pipe_write: head: 36 -> 37 [tail: 36] hackbench-118762 [057] ..... 1029.550548: pipe_write: *wake up next reader 118740* hackbench-118762 [057] ..... 1029.550548: pipe_write: *wake up next writer 118768* hackbench-118768 [206] ..... 1029.55055X: pipe_write: *writer wakes up* hackbench-118768 [206] ..... 1029.55055X: pipe_write: head = READ_ONCE(pipe->head) [37] ... CPU 206 interrupted (exact wakeup was not traced but 118768 did read head at 37 in traces) hackbench-118740 [057] ..... 1029.550558: pipe_read: *reader wakes up: pipe is not empty* hackbench-118740 [057] ..... 1029.550558: pipe_read: tail: 36 -> 37 [head = 37] hackbench-118740 [057] ..... 1029.550559: pipe_read: *pipe is empty; wakeup writer 118768* hackbench-118740 [057] ..... 1029.550559: pipe_read: *sleeps* hackbench-118766 [185] ..... 1029.550592: pipe_write: *New writer comes in* hackbench-118766 [185] ..... 1029.550592: pipe_write: head: 37 -> 38 [tail: 37] hackbench-118766 [185] ..... 1029.550592: pipe_write: *wakes up reader 118766* hackbench-118740 [185] ..... 1029.550598: pipe_read: *reader wakes up; pipe not empty* hackbench-118740 [185] ..... 1029.550599: pipe_read: tail: 37 -> 38 [head: 38] hackbench-118740 [185] ..... 1029.550599: pipe_read: *pipe is empty* hackbench-118740 [185] ..... 1029.550599: pipe_read: *reader sleeps; wakeup writer 118768* ... CPU 206 switches back to writer hackbench-118768 [206] ..... 1029.550601: pipe_write: tail = READ_ONCE(pipe->tail) [38] hackbench-118768 [206] ..... 1029.550601: pipe_write: pipe_full()? (u32)(37 - 38) >= 16? Yes hackbench-118768 [206] .....
1029.550601: pipe_write: *writer goes back to sleep* [ Tasks 118740 and 118768 can then indefinitely wait on each other. ] The unsigned arithmetic in pipe_occupancy() wraps around when "pipe->tail > pipe->head" leading to pipe_full() returning true despite the pipe being empty. The case of genuine wraparound of "pipe->head" is handled since the pipe buffer has data, allowing readers to make progress until the pipe->tail wraps too, after which the reader will wake up a sleeping writer. However, mistaking the pipe to be full when it is in fact empty can lead to readers and writers waiting on each other indefinitely. This issue became more problematic and surfaced as a hang in hackbench after the optimization in commit aaec5a95d596 ("pipe_read: don't wake up the writer if the pipe is still full") significantly reduced the number of spurious wakeups of writers that had previously helped mask the issue. To avoid missing any updates between the reads of "pipe->head" and "pipe->tail", unionize the two with a single unsigned long "pipe->head_tail" member that can be loaded atomically. Using "pipe->head_tail" to read the head and the tail ensures the lockless checks do not miss any updates to the head or the tail and since those two are only updated under "pipe->mutex", it ensures that the head is always ahead of, or equal to, the tail resulting in correct calculations. [ prateek: commit log, testing on x86 platforms. ] Reported-and-debugged-by: Swapnil Sapkal Closes: https://lore.kernel.org/lkml/e813814e-7094-4673-bc69-731af065a0eb@amd.com/ Reported-by: Alexey Gladkov Closes: https://lore.kernel.org/all/Z8Wn0nTvevLRG_4m@example.org/ Fixes: 8cefc107ca54 ("pipe: Use head and tail pointers for the ring, not cursor and length") Tested-by: Swapnil Sapkal Reviewed-by: Oleg Nesterov Tested-by: Alexey Gladkov Signed-off-by: K Prateek Nayak Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 8ff23bf5a819..3cc4f8eab853 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -31,6 +31,33 @@ struct pipe_buffer { unsigned long private; }; +/* + * Really only alpha needs 32-bit fields, but + * might as well do it for 64-bit architectures + * since that's what we've historically done, + * and it makes 'head_tail' always be a simple + * 'unsigned long'. + */ +#ifdef CONFIG_64BIT +typedef unsigned int pipe_index_t; +#else +typedef unsigned short pipe_index_t; +#endif + +/* + * We have to declare this outside 'struct pipe_inode_info', + * but then we can't use 'union pipe_index' for an anonymous + * union, so we end up having to duplicate this declaration + * below. Annoying.
+ */ +union pipe_index { + unsigned long head_tail; + struct { + pipe_index_t head; + pipe_index_t tail; + }; +}; + /** * struct pipe_inode_info - a linux kernel pipe * @mutex: mutex protecting the whole thing @@ -58,8 +85,16 @@ struct pipe_buffer { struct pipe_inode_info { struct mutex mutex; wait_queue_head_t rd_wait, wr_wait; - unsigned int head; - unsigned int tail; + + /* This has to match the 'union pipe_index' above */ + union { + unsigned long head_tail; + struct { + pipe_index_t head; + pipe_index_t tail; + }; + }; + unsigned int max_usage; unsigned int ring_size; unsigned int nr_accounted; -- cgit v1.2.3 From ab2bb9c084f7e3b641ceba91efc33b3adcd1846e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 3 Mar 2025 11:52:36 -0500 Subject: percpu: Introduce percpu hot section Add a subsection to the percpu data for frequently accessed variables that should remain cached on each processor. These variables should not be accessed from other processors to avoid cacheline bouncing. This will replace the pcpu_hot struct on x86, and open up similar functionality to other architectures and the kernel core. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Acked-by: Uros Bizjak Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250303165246.2175811-2-brgerst@gmail.com --- include/linux/percpu-defs.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 40d34e032d5b..0fcacb909778 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -112,6 +112,19 @@ #define DEFINE_PER_CPU(type, name) \ DEFINE_PER_CPU_SECTION(type, name, "") +/* + * Declaration/definition used for per-CPU variables that are frequently + * accessed and should be in a single cacheline. + * + * For use only by architecture and core code. Only use scalar or pointer + * types to maximize density. + */ +#define DECLARE_PER_CPU_CACHE_HOT(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, "..hot.." #name) + +#define DEFINE_PER_CPU_CACHE_HOT(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "..hot.." #name) + /* * Declaration/definition used for per-CPU variables that must be cacheline * aligned under SMP conditions so that, whilst a particular instance of the -- cgit v1.2.3 From 46e8fff6d45fe44cb0939c9339b6d5936551b1e4 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 3 Mar 2025 11:52:38 -0500 Subject: x86/preempt: Move preempt count to percpu hot section No functional change.
Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar Acked-by: Uros Bizjak Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250303165246.2175811-4-brgerst@gmail.com --- include/linux/preempt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index ca86235ac15c..4c1af9b7e28b 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -319,6 +319,7 @@ do { \ #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; +struct task_struct; /** * preempt_ops - notifiers called when a task is preempted and rescheduled -- cgit v1.2.3 From 778b94d7ac17b5800aa857222911f09cc986b509 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 28 Feb 2025 11:01:53 -0600 Subject: ACPI: platform_profile: Add support for hidden choices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When two drivers don't support all the same profiles the legacy interface only exports the common profiles. This causes problems for cases where one driver uses low-power but another uses quiet because the result is that neither is exported to sysfs. To allow two drivers to disagree, add support for "hidden choices". Hidden choices are platform profiles that a driver supports to be compatible with the platform profile of another driver. Fixes: 688834743d67 ("ACPI: platform_profile: Allow multiple handlers") Reported-by: Antheas Kapenekakis Closes: https://lore.kernel.org/platform-driver-x86/e64b771e-3255-42ad-9257-5b8fc6c24ac9@gmx.de/T/#mc068042dd29df36c16c8af92664860fc4763974b Signed-off-by: Mario Limonciello Tested-by: Antheas Kapenekakis Tested-by: Derek J. Clark Acked-by: Ilpo Järvinen Link: https://patch.msgid.link/20250228170155.2623386-2-superm1@kernel.org Signed-off-by: Rafael J. Wysocki --- include/linux/platform_profile.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 8ab5b0e8eb2c..8c9df7dadd5d 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -33,6 +33,8 @@ enum platform_profile_option { * @probe: Callback to setup choices available to the new class device. These * choices will only be enforced when setting a new profile, not when * getting the current one. + * @hidden_choices: Callback to setup choices that are not visible to the user + * but can be set by the driver. * @profile_get: Callback that will be called when showing the current platform * profile in sysfs. * @profile_set: Callback that will be called when storing a new platform @@ -40,6 +42,7 @@ enum platform_profile_option { */ struct platform_profile_ops { int (*probe)(void *drvdata, unsigned long *choices); + int (*hidden_choices)(void *drvdata, unsigned long *choices); int (*profile_get)(struct device *dev, enum platform_profile_option *profile); int (*profile_set)(struct device *dev, enum platform_profile_option profile); }; -- cgit v1.2.3 From 5c8265fa63e4c60cd80f28bb0a682cf97b8a60e9 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 25 Feb 2025 18:06:01 +0100 Subject: PCI: hotplug: Drop superfluous pci_hotplug_slot_list The PCI hotplug core keeps a list of all registered slots. Its sole purpose is to WARN() on slot removal if another slot is using the same name. 
But this can never happen because already on slot creation, an error is returned and multiple messages are emitted if a slot's name is duplicated: pci_hp_register() __pci_hp_register() __pci_hp_initialize() pci_create_slot() kobject_init_and_add() kobject_add_varg() kobject_add_internal() create_dir() sysfs_create_dir_ns() kernfs_create_dir_ns() sysfs_warn_dup() pr_warn("cannot create duplicate filename ...") pr_err("%s failed for %s with -EEXIST, ..."); Drop the superfluous list. Link: https://lore.kernel.org/r/603735bc50eb370bc7f1c358441ac671360bab25.1740501868.git.lukas@wunner.de Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas --- include/linux/pci_hotplug.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index 3a10d6ec3ee7..ec77ccf1fc4d 100644 --- a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -50,7 +50,6 @@ struct hotplug_slot_ops { /** * struct hotplug_slot - used to register a physical slot with the hotplug pci core * @ops: pointer to the &struct hotplug_slot_ops to be used for this slot - * @slot_list: internal list used to track hotplug PCI slots * @pci_slot: represents a physical slot * @owner: The module owner of this structure * @mod_name: The module name (KBUILD_MODNAME) of this structure @@ -59,7 +58,6 @@ struct hotplug_slot { const struct hotplug_slot_ops *ops; /* Variables below this are for use only by the hotplug pci core. */ - struct list_head slot_list; struct pci_slot *pci_slot; struct module *owner; const char *mod_name; -- cgit v1.2.3 From cc973ef13f8e31608d34c349c6ed85d44d716523 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 25 Feb 2025 18:06:05 +0100 Subject: PCI: hotplug: Inline pci_hp_{create,remove}_module_link() For no apparent reason, the pci_hp_{create,remove}_module_link() helpers live in slot.c, even though they're only called from two functions in pci_hotplug_core.c. Inline the helpers to reduce code size and number of exported symbols. Link: https://lore.kernel.org/r/c207f03cfe32ae9002d9b453001a1dd63d9ab3fb.1740501868.git.lukas@wunner.de Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 47b31ad724fa..a0f5c8fcd9c7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2447,11 +2447,6 @@ static inline resource_size_t pci_iov_resource_size(struct pci_dev *dev, int res static inline void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe) { } #endif -#if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE) -void pci_hp_create_module_link(struct pci_slot *pci_slot); -void pci_hp_remove_module_link(struct pci_slot *pci_slot); -#endif - /** * pci_pcie_cap - get the saved PCIe capability offset * @dev: PCI device -- cgit v1.2.3 From 95d0d094ba26432ec467e2260f4bf553053f1f8f Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Sat, 1 Mar 2025 21:55:16 +0800 Subject: ppp: use IFF_NO_QUEUE in virtual interfaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For PPPoE, PPTP, and PPPoL2TP, the start_xmit() function directly forwards packets to the underlying network stack and never returns anything other than 1. So these interfaces do not require a qdisc, and the IFF_NO_QUEUE flag should be set. Introduces a direct_xmit flag in struct ppp_channel to indicate when IFF_NO_QUEUE should be applied. 
The flag is set in ppp_connect_channel() for relevant protocols. While at it, remove the unused latency member from struct ppp_channel. Signed-off-by: Qingfang Deng Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250301135517.695809-1-dqfext@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ppp_channel.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ppp_channel.h b/include/linux/ppp_channel.h index 45e6e427ceb8..f73fbea0dbc2 100644 --- a/include/linux/ppp_channel.h +++ b/include/linux/ppp_channel.h @@ -42,8 +42,7 @@ struct ppp_channel { int hdrlen; /* amount of headroom channel needs */ void *ppp; /* opaque to channel */ int speed; /* transfer rate (bytes/second) */ - /* the following is not used at present */ - int latency; /* overhead time in milliseconds */ + bool direct_xmit; /* no qdisc, xmit directly */ }; #ifdef __KERNEL__ -- cgit v1.2.3 From 23c22d91561dd555d07381fb1f567539769c2ea0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 20 Dec 2024 13:09:30 -0800 Subject: rcu-tasks: Move RCU Tasks self-tests to core_initcall() The timer and hrtimer softirq processing has moved to dedicated threads for kernels built with CONFIG_IRQ_FORCED_THREADING=y. This results in timers not expiring until later in early boot, which in turn causes the RCU Tasks self-tests to hang in kernels built with CONFIG_PROVE_RCU=y, which further causes the entire kernel to hang. One fix would be to make timers work during this time, but there are no known users of RCU Tasks grace periods during that time, so no justification for the added complexity. Not yet, anyway. This commit therefore moves the call to rcu_init_tasks_generic() from kernel_init_freeable() to a core_initcall(). This works because the timer and hrtimer kthreads are created at early_initcall() time. Fixes: 49a17639508c3 ("softirq: Use a dedicated thread for timer wakeups on PREEMPT_RT.") Signed-off-by: Paul E. McKenney Cc: Sebastian Andrzej Siewior Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Masami Hiramatsu Cc: Tested-by: Sebastian Andrzej Siewior Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Boqun Feng --- include/linux/rcupdate.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd..36849a4ea141 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -121,12 +121,6 @@ void rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); -#ifdef CONFIG_TASKS_RCU_GENERIC -void rcu_init_tasks_generic(void); -#else -static inline void rcu_init_tasks_generic(void) { } -#endif - #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); -- cgit v1.2.3 From 6ea9a1781c70a8be1fcdc49134fc1bf4baba8bca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Feb 2025 10:33:28 -0800 Subject: Flush console log from kernel_power_off() Kernels built with CONFIG_PREEMPT_RT=y can lose significant console output and shutdown time, which hides shutdown-time RCU issues from rcutorture. Therefore, make pr_flush() public and invoke it after the last print in kernel_power_off(). [ paulmck: Apply John Ogness feedback. ] [ paulmck: Apply Sebastian Andrzej Siewior feedback. ] [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E.
McKenney Reviewed-by: John Ogness Reviewed-by: Petr Mladek Cc: Steven Rostedt Cc: Sergey Senozhatsky Link: https://lore.kernel.org/r/5f743488-dc2a-4f19-bdda-cf50b9314832@paulmck-laptop Signed-off-by: Boqun Feng --- include/linux/printk.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 4217a9f412b2..5b462029d03c 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -207,6 +207,7 @@ void printk_legacy_allow_panic_sync(void); extern bool nbcon_device_try_acquire(struct console *con); extern void nbcon_device_release(struct console *con); void nbcon_atomic_flush_unsafe(void); +bool pr_flush(int timeout_ms, bool reset_on_progress); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -315,6 +316,11 @@ static inline void nbcon_atomic_flush_unsafe(void) { } +static inline bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + return true; +} + #endif bool this_cpu_in_panic(void); -- cgit v1.2.3 From 5a562b8b3f5de4c50f4a9da92bfd3f0a6eebf081 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 27 Feb 2025 14:16:13 +0100 Subject: rcu: Use _full() API to debug synchronize_rcu() Switch to using the get_state_synchronize_rcu_full() and poll_state_synchronize_rcu_full() pair to debug a normal synchronize_rcu() call. Just using the non-full APIs to identify whether a grace period has passed might lead to a false-positive kernel splat. It can happen because get_state_synchronize_rcu() compresses both normal and expedited states into one single unsigned long value, so a poll_state_synchronize_rcu() can miss GP-completion when synchronize_rcu()/synchronize_rcu_expedited() run concurrently. To address this, switch to poll_state_synchronize_rcu_full() and get_state_synchronize_rcu_full() APIs, which use separate variables for expedited and normal states. Reported-by: cheung wall Closes: https://lore.kernel.org/lkml/Z5ikQeVmVdsWQrdD@pc636/T/ Fixes: 988f569ae041 ("rcu: Reduce synchronize_rcu() latency") Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Paul E. McKenney Link: https://lore.kernel.org/r/20250227131613.52683-3-urezki@gmail.com Signed-off-by: Boqun Feng --- include/linux/rcupdate_wait.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index f9bed3d3f78d..4c92d4291cce 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -16,6 +16,9 @@ struct rcu_synchronize { struct rcu_head head; struct completion completion; + + /* This is for debugging. */ + struct rcu_gp_oldstate oldstate; }; void wakeme_after_rcu(struct rcu_head *head); -- cgit v1.2.3 From 939c5de3c70d145d7388db1b04d75cda79297c23 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Sat, 1 Mar 2025 16:37:20 +0800 Subject: mm/slab: call kmalloc_noprof() unconditionally in kmalloc_array_noprof() If 'n' or 'size' isn't a builtin constant, we used to call __kmalloc() before commit 7bd230a26648 ("mm/slab: enable slab allocation tagging for kmalloc and friends"), which inadvertently changed both paths to kmalloc_noprof(). As Harry Yoo points out, we can just call kmalloc_noprof() unconditionally. Even if the compiler knows n and size are constants, it doesn't guarantee that bytes will also be seen as constant, and that is the important test in kmalloc_noprof() anyway, so we can just defer to it always.
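Callers are unaffected either way; as a usage sketch (hypothetical caller), both constant and non-constant sizes go through the same overflow check and then kmalloc_noprof():

	struct foo *arr;

	arr = kmalloc_array(nelems, sizeof(*arr), GFP_KERNEL);
	if (!arr)
		return -ENOMEM;	/* also taken when nelems * sizeof(*arr) overflows */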
[ vbabka@suse.cz: change as Harry suggested and adjust commit log ] Fixes: 7bd230a26648 ("mm/slab: enable slab allocation tagging for kmalloc and friends") Signed-off-by: Ye Bin Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 09eedaecf120..ab05a143d09a 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -941,8 +941,6 @@ static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t siz if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; - if (__builtin_constant_p(n) && __builtin_constant_p(size)) - return kmalloc_noprof(bytes, flags); return kmalloc_noprof(bytes, flags); } #define kmalloc_array(...) alloc_hooks(kmalloc_array_noprof(__VA_ARGS__)) -- cgit v1.2.3 From 8376583b84a1937196514dbe380917d61af29978 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 27 Feb 2025 12:32:57 +1100 Subject: nfs: change mkdir inode_operation to return alternate dentry if needed. mkdir now allows a different dentry to be returned which is sometimes relevant for nfs. This patch changes the nfs_rpc_ops mkdir op to return a dentry, and passes that back to the caller. The mkdir nfs_rpc_op will return NULL if the original dentry should be used. This matches the mkdir inode_operation. nfs4_do_create() is duplicated to nfs4_do_mkdir() which is changed to handle the specifics of directories. Consequently the current special handling for directories is removed from nfs4_do_create() Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/20250227013949.536172-6-neilb@suse.de Signed-off-by: Christian Brauner --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9155a6ffc370..d66c61cbbd1d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1802,7 +1802,7 @@ struct nfs_rpc_ops { int (*link) (struct inode *, struct inode *, const struct qstr *); int (*symlink) (struct inode *, struct dentry *, struct folio *, unsigned int, struct iattr *); - int (*mkdir) (struct inode *, struct dentry *, struct iattr *); + struct dentry *(*mkdir) (struct inode *, struct dentry *, struct iattr *); int (*rmdir) (struct inode *, const struct qstr *); int (*readdir) (struct nfs_readdir_arg *, struct nfs_readdir_res *); int (*mknod) (struct inode *, struct dentry *, struct iattr *, -- cgit v1.2.3 From c54b386969a58151765a9ffaaa0438e7b580283f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 27 Feb 2025 12:32:58 +1100 Subject: VFS: Change vfs_mkdir() to return the dentry. vfs_mkdir() does not guarantee to leave the child dentry hashed or make it positive on success, and in many such cases the filesystem had to use a different dentry which it can now return. This patch changes vfs_mkdir() to return the dentry provided by the filesystems which is hashed and positive when provided. This reduces the number of cases where the resulting dentry is not positive to a handful which don't deserve extra efforts. The only callers of vfs_mkdir() which are interested in the resulting inode are in-kernel filesystem clients: cachefiles, nfsd, smb/server. The only filesystems that don't reliably provide the inode are: - kernfs, tracefs which these clients are unlikely to be interested in - cifs in some configurations would need to do a lookup to find the created inode, but doesn't. 
cifs cannot be exported via NFS, is unlikely to be used by cachefiles, and smb/server only has a soft requirement for the inode, so this is unlikely to be a problem in practice. - hostfs, nfs, cifs may need to do a lookup (rarely for NFS) and it is possible for a race to make that lookup fail. Actual failure is unlikely and, provided callers handle negative dentries gracefully, they will fail safe. So this patch removes the lookup code in nfsd and smb/server and adjusts them to fail safe if a negative dentry is provided: - cache-files already fails safe by restarting the task from the top - it still does with this change, though it no longer calls cachefiles_put_directory() as that will crash if the dentry is negative. - nfsd reports "Server-fault" which is what it used to do if the lookup failed. This will never happen on any file-systems that it can actually export, so this is of no consequence. I removed the fh_update() call as that is not needed and out-of-place. A subsequent nfsd_create_setattr() call will call fh_update() when needed. - smb/server only wants the inode to call ksmbd_smb_inherit_owner() which updates ->i_uid (without calling notify_change() or similar) which can be safely skipped on cifs (I hope). If a different dentry is returned, the first one is put. If necessary, the fact that it is new can be determined by comparing pointers. A new dentry will certainly have a new pointer (as the old is put after the new is obtained). Similarly, if an error is returned (via ERR_PTR()) the original dentry is put. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/20250227013949.536172-7-neilb@suse.de Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 45269608ee23..beae24bc990d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1981,8 +1981,8 @@ bool inode_owner_or_capable(struct mnt_idmap *idmap, */ int vfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); -int vfs_mkdir(struct mnt_idmap *, struct inode *, - struct dentry *, umode_t); +struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, + struct dentry *, umode_t); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); int vfs_symlink(struct mnt_idmap *, struct inode *, -- cgit v1.2.3 From 59b59a943177e1a96d645a63d1ad824644fa848a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 Mar 2025 17:02:23 +0000 Subject: fscrypt: Change fscrypt_encrypt_pagecache_blocks() to take a folio ext4 and ceph already have a folio to pass; f2fs needs to be properly converted but this will do for now. This removes a reference to page->index and page->mapping as well as removing a call to compound_head().
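A converted call site follows this rough sketch (hypothetical caller; per the hunk below, the function still returns a bounce page or an ERR_PTR):

	struct page *bounce_page;

	bounce_page = fscrypt_encrypt_pagecache_blocks(folio, len, offs, GFP_NOFS);
	if (IS_ERR(bounce_page))
		return PTR_ERR(bounce_page);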
Signed-off-by: "Matthew Wilcox (Oracle)" Link: https://lore.kernel.org/r/20250304170224.523141-1-willy@infradead.org Acked-by: Eric Biggers Signed-off-by: Christian Brauner --- include/linux/fscrypt.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 18855cb44b1c..56fad33043d5 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -310,10 +310,8 @@ static inline void fscrypt_prepare_dentry(struct dentry *dentry, /* crypto.c */ void fscrypt_enqueue_decrypt_work(struct work_struct *); -struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, - unsigned int len, - unsigned int offs, - gfp_t gfp_flags); +struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, + size_t len, size_t offs, gfp_t gfp_flags); int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags); @@ -480,10 +478,8 @@ static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) { } -static inline struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, - unsigned int len, - unsigned int offs, - gfp_t gfp_flags) +static inline struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, + size_t len, size_t offs, gfp_t gfp_flags) { return ERR_PTR(-EOPNOTSUPP); } -- cgit v1.2.3 From 4513522984a0af9c170af991c37fbb483cca654b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 5 Mar 2025 11:08:15 +0100 Subject: pidfs: record exit code and cgroupid at exit Record the exit code and cgroupid in release_task() and stash in struct pidfs_exit_info so it can be retrieved even after the task has been reaped. Link: https://lore.kernel.org/r/20250305-work-pidfs-kill_on_last_close-v3-5-c8c3d8361705@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- include/linux/pidfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 7c830d0dec9a..05e6f8f4a026 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -6,6 +6,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); void pidfs_add_pid(struct pid *pid); void pidfs_remove_pid(struct pid *pid); +void pidfs_exit(struct task_struct *tsk); extern const struct dentry_operations pidfs_dentry_operations; #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From e859d375d1694488015e6804bfeea527a0b25b9f Mon Sep 17 00:00:00 2001 From: Wojtek Wasko Date: Mon, 3 Mar 2025 18:13:43 +0200 Subject: posix-clock: Store file pointer in struct posix_clock_context File descriptor based pc_clock_*() operations of dynamic posix clocks have access to the file pointer and implement permission checks in the generic code before invoking the relevant dynamic clock callback. Character device operations (open, read, poll, ioctl) do not implement a generic permission control and the dynamic clock callbacks have no access to the file pointer to implement them. Extend struct posix_clock_context with a struct file pointer and initialize it in posix_clock_open(), so that all dynamic clock callbacks can access it. Acked-by: Richard Cochran Reviewed-by: Vadim Fedorenko Reviewed-by: Thomas Gleixner Signed-off-by: Wojtek Wasko Signed-off-by: David S. 
Miller --- include/linux/posix-clock.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h index ef8619f48920..a500d3160fe8 100644 --- a/include/linux/posix-clock.h +++ b/include/linux/posix-clock.h @@ -95,10 +95,13 @@ struct posix_clock { * struct posix_clock_context - represents clock file operations context * * @clk: Pointer to the clock + * @fp: Pointer to the file used to open the clock * @private_clkdata: Pointer to user data * * Drivers should use struct posix_clock_context during specific character - * device file operation methods to access the posix clock. + * device file operation methods to access the posix clock. In particular, + * the file pointer can be used to verify correct access mode for ioctl() + * calls. * * Drivers can store a private data structure during the open operation * if they have specific information that is required in other file @@ -106,6 +109,7 @@ struct posix_clock { */ struct posix_clock_context { struct posix_clock *clk; + struct file *fp; void *private_clkdata; }; -- cgit v1.2.3 From f636d4f60ac477187a466a573f947731fa762059 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Wed, 5 Mar 2025 15:13:00 +0200 Subject: gpio: Add a valid_mask getter The valid_mask member of the struct gpio_chip is unconditionally written by the GPIO core at driver registration. It shouldn't be directly populated by drivers. This can be prevented by moving it from the struct gpio_chip to struct gpio_device, which is internal to the GPIO core. As a preparatory step, provide a getter function which can be used by those drivers which need the valid_mask information. Signed-off-by: Matti Vaittinen Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/026f9d78502eca883bfe3faeb684e23d5d6c5e84.1741180097.git.mazziesaccount@gmail.com Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 83e0a7e86962..e3b59fda62e0 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -720,6 +720,7 @@ bool gpiochip_line_is_open_source(struct gpio_chip *gc, unsigned int offset); /* Sleep persistence inquiry for drivers */ bool gpiochip_line_is_persistent(struct gpio_chip *gc, unsigned int offset); bool gpiochip_line_is_valid(const struct gpio_chip *gc, unsigned int offset); +const unsigned long *gpiochip_query_valid_mask(const struct gpio_chip *gc); /* get driver data */ void *gpiochip_get_data(struct gpio_chip *gc); -- cgit v1.2.3 From 8015443e24e76fac97268603e91c4793970ce657 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Wed, 5 Mar 2025 15:13:25 +0200 Subject: gpio: Hide valid_mask from direct assignments The valid_mask member of the struct gpio_chip is unconditionally written by the GPIO core at driver registration. Current documentation does not mention this but just says the valid_mask is used if it's not NULL. This lured me to try populating it directly in the GPIO driver probe instead of using the init_valid_mask() callback. It took some retries with different bitmaps and eventually a bit of code-reading to understand why the valid_mask was not obeyed. I could've avoided this trial and error if the valid_mask was hidden in the struct gpio_device instead of being a visible member of the struct gpio_chip. 
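For reference, the supported way to restrict usable lines is the gpio_chip init_valid_mask() callback; a minimal sketch (hypothetical driver, assuming the usual gpiolib callback signature):

static int foo_init_valid_mask(struct gpio_chip *gc,
			       unsigned long *valid_mask,
			       unsigned int ngpios)
{
	/* Hypothetical example: only the first four lines are usable. */
	bitmap_clear(valid_mask, 4, ngpios - 4);
	return 0;
}

	/* in probe, before registering the chip: */
	gc->init_valid_mask = foo_init_valid_mask;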
Help the next developer who decides to directly populate the valid_mask in struct gpio_chip by hiding the valid_mask in struct gpio_device and keeping it internal to the GPIO core. Suggested-by: Linus Walleij Signed-off-by: Matti Vaittinen Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/4547ca90d910d60cab3d56d864d59ddde47a5e93.1741180097.git.mazziesaccount@gmail.com Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index e3b59fda62e0..e6e5304c99ca 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -514,14 +514,6 @@ struct gpio_chip { struct gpio_irq_chip irq; #endif /* CONFIG_GPIOLIB_IRQCHIP */ - /** - * @valid_mask: - * - * If not %NULL, holds bitmask of GPIOs which are valid to be used - * from the chip. - */ - unsigned long *valid_mask; - #if defined(CONFIG_OF_GPIO) /* * If CONFIG_OF_GPIO is enabled, then all GPIO controllers described in -- cgit v1.2.3 From 0312e94abe484b9ee58c32d2f8ba177e04955b35 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 5 Mar 2025 23:59:32 +0900 Subject: treewide: fix typo 'unsigned __init128' -> 'unsigned __int128' "int" was misspelled as "init" in the code comments in the bits.h and const.h files. Fix the typo. CC: Andy Shevchenko Signed-off-by: Vincent Mailhol Signed-off-by: Yury Norov --- include/linux/bits.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bits.h b/include/linux/bits.h index 61a75d3f294b..14fd0ca9a6cd 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -40,7 +40,7 @@ * Missing asm support * * __GENMASK_U128() depends on _BIT128() which would not work - * in the asm code, as it shifts an 'unsigned __init128' data + * in the asm code, as it shifts an 'unsigned __int128' data * type instead of direct representation of 128 bit constants * such as long and unsigned long. The fundamental problem is * that a 128 bit constant will get silently truncated by the -- cgit v1.2.3 From c27c66afc449b80f3b4b84d123358c0248f2cf63 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Mar 2025 07:08:09 -1000 Subject: fs/pipe: Fix pipe_occupancy() with 16-bit indexes The pipe_occupancy() logic implicitly relied on the natural unsigned modulo arithmetic in C, but that doesn't work for the new 'pipe_index_t' case, since any arithmetic will be done in 'int' (and here we had also made it 'unsigned int' due to the function call boundary). So make the modulo arithmetic explicit by casting the result to the proper type.
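To make the arithmetic concrete, consider a 32-bit build (16-bit pipe_index_t) where 'head' has wrapped around; a sketch of the two computations:

	unsigned int head = 1, tail = 0xffff;	/* head wrapped, 2 slots in use */

	unsigned int bogus = head - tail;		/* 0xffff0002: pipe looks huge/full */
	unsigned int occ = (pipe_index_t)(head - tail);	/* 2: the intended occupancy */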
Cc: Oleg Nesterov Cc: Mateusz Guzik Cc: Manfred Spraul Cc: Christian Brauner Cc: Swapnil Sapkal Cc: Alexey Gladkov Cc: K Prateek Nayak Link: https://lore.kernel.org/all/CAHk-=wjyHsGLx=rxg6PKYBNkPYAejgo7=CbyL3=HGLZLsAaJFQ@mail.gmail.com/ Fixes: 3d252160b818 ("fs/pipe: Read pipe->{head,tail} atomically outside pipe->mutex") Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 3cc4f8eab853..1f013ed7577e 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -192,7 +192,7 @@ static inline bool pipe_empty(unsigned int head, unsigned int tail) */ static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail) { - return head - tail; + return (pipe_index_t)(head - tail); } /** -- cgit v1.2.3 From cfced12f5100e50d56bc587299393fd33c1169a9 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Wed, 5 Mar 2025 11:23:01 +0000 Subject: include/linux/pipe_fs_i: Add htmldoc annotation for "head_tail" member Add htmldoc annotation for the newly introduced "head_tail" member describing it as a union of the pipe_inode_info's @head and @tail members. Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/lkml/20250305204609.5e64768e@canb.auug.org.au/ Fixes: 3d252160b818 ("fs/pipe: Read pipe->{head,tail} atomically outside pipe->mutex") Signed-off-by: K Prateek Nayak Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 1f013ed7577e..05ccbc5d0129 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -65,6 +65,7 @@ union pipe_index { * @wr_wait: writer wait point in case of full pipe * @head: The point of buffer production * @tail: The point of buffer consumption + * @head_tail: unsigned long union of @head and @tail * @note_loss: The next read() should insert a data-lost message * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) -- cgit v1.2.3 From e83588458f656417d9b7ac45baf1c7b45790059b Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 5 Mar 2025 13:36:41 +0100 Subject: file: add fput and file_ref_put routines optimized for use when closing a fd The vast majority of the time, closing a file descriptor also operates on the last reference, where a regular fput usage will result in 2 atomics. This can be changed to only suffer 1. See commentary above file_ref_put_close() for more information.
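A minimal sketch of the intended close-path usage, with a hypothetical object embedding a file_ref_t (regular reference drops should keep using file_ref_put()):

	struct my_obj {
		file_ref_t ref;
	};

	static void my_obj_put_close(struct my_obj *obj)
	{
		/* One atomic in the common last-reference case. */
		if (file_ref_put_close(&obj->ref))
			kfree(obj);	/* we dropped the last reference */
	}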
Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250305123644.554845-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/file_ref.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/file_ref.h b/include/linux/file_ref.h index 9b3a8d9b17ab..6ef92d765a66 100644 --- a/include/linux/file_ref.h +++ b/include/linux/file_ref.h @@ -61,6 +61,7 @@ static inline void file_ref_init(file_ref_t *ref, unsigned long cnt) atomic_long_set(&ref->refcnt, cnt - 1); } +bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt); bool __file_ref_put(file_ref_t *ref, unsigned long cnt); /** @@ -160,6 +161,39 @@ static __always_inline __must_check bool file_ref_put(file_ref_t *ref) return __file_ref_put(ref, cnt); } +/** + * file_ref_put_close - drop a reference expecting it would transition to FILE_REF_NOREF + * @ref: Pointer to the reference count + * + * Semantically it is equivalent to calling file_ref_put(), but it trades lower + * performance in face of other CPUs also modifying the refcount for higher + * performance when this happens to be the last reference. + * + * For the last reference file_ref_put() issues 2 atomics. One to drop the + * reference and another to transition it to FILE_REF_DEAD. This routine does + * the work in one step, but in order to do it has to pre-read the variable which + * decreases scalability. + * + * Use with close() et al, stick to file_ref_put() by default. + */ +static __always_inline __must_check bool file_ref_put_close(file_ref_t *ref) +{ + long old, new; + + old = atomic_long_read(&ref->refcnt); + do { + if (unlikely(old < 0)) + return __file_ref_put_badval(ref, old); + + if (old == FILE_REF_ONEREF) + new = FILE_REF_DEAD; + else + new = old - 1; + } while (!atomic_long_try_cmpxchg(&ref->refcnt, &old, new)); + + return new == FILE_REF_DEAD; +} + /** * file_ref_read - Read the number of file references * @ref: Pointer to the reference count -- cgit v1.2.3 From 0d2d0f3d93ddd6556f23c917d910becd9925ddeb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Mar 2025 07:35:40 -1000 Subject: fs/pipe: remove buggy and unused 'helper' function While looking for incorrect users of the pipe head/tail fields (see commit c27c66afc449: "fs/pipe: Fix pipe_occupancy() with 16-bit indexes"), I found a bug in pipe_discard_from() that looked entirely broken. However, the fix is trivial: this buggy function isn't actually called by anything, so let's just remove it ASAP. Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 05ccbc5d0129..e572e6fc4f81 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -281,15 +281,6 @@ static inline bool pipe_buf_try_steal(struct pipe_inode_info *pipe, return buf->ops->try_steal(pipe, buf); } -static inline void pipe_discard_from(struct pipe_inode_info *pipe, - unsigned int old_head) -{ - unsigned int mask = pipe->ring_size - 1; - - while (pipe->head > old_head) - pipe_buf_release(pipe, &pipe->bufs[--pipe->head & mask]); -} - /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual memory allocation, whereas PIPE_BUF makes atomicity guarantees. */ #define PIPE_SIZE PAGE_SIZE -- cgit v1.2.3 From 7e2f7e25f6ffc229fd5be0488e1d8aac7bdd80e8 Mon Sep 17 00:00:00 2001 From: "David E. 
Box" Date: Thu, 27 Feb 2025 20:15:19 +0800 Subject: arch: x86: add IPC mailbox accessor function and add SoC register access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Exports intel_pmc_ipc() for host access to the PMC IPC mailbox - Enables the host to access specific SoC registers through the PMC firmware using IPC commands. This access method is necessary for registers that are not available through direct Memory-Mapped I/O (MMIO), which is used for other accessible parts of the PMC. Signed-off-by: David E. Box Signed-off-by: Chao Qin Signed-off-by: Choong Yong Liang Acked-by: Ilpo Järvinen Link: https://patch.msgid.link/20250227121522.1802832-4-yong.liang.choong@linux.intel.com Signed-off-by: Jakub Kicinski --- include/linux/platform_data/x86/intel_pmc_ipc.h | 94 +++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 include/linux/platform_data/x86/intel_pmc_ipc.h (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/intel_pmc_ipc.h b/include/linux/platform_data/x86/intel_pmc_ipc.h new file mode 100644 index 000000000000..6e603a8c075f --- /dev/null +++ b/include/linux/platform_data/x86/intel_pmc_ipc.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Intel Core SoC Power Management Controller Header File + * + * Copyright (c) 2025, Intel Corporation. + * All Rights Reserved. + * + */ +#ifndef INTEL_PMC_IPC_H +#define INTEL_PMC_IPC_H +#include + +#define IPC_SOC_REGISTER_ACCESS 0xAA +#define IPC_SOC_SUB_CMD_READ 0x00 +#define IPC_SOC_SUB_CMD_WRITE 0x01 +#define PMC_IPCS_PARAM_COUNT 7 +#define VALID_IPC_RESPONSE 5 + +struct pmc_ipc_cmd { + u32 cmd; + u32 sub_cmd; + u32 size; + u32 wbuf[4]; +}; + +struct pmc_ipc_rbuf { + u32 buf[4]; +}; + +/** + * intel_pmc_ipc() - PMC IPC Mailbox accessor + * @ipc_cmd: Prepared input command to send + * @rbuf: Allocated array for returned IPC data + * + * Return: 0 on success. 
Non-zero on mailbox error + */ +static inline int intel_pmc_ipc(struct pmc_ipc_cmd *ipc_cmd, struct pmc_ipc_rbuf *rbuf) +{ + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object params[PMC_IPCS_PARAM_COUNT] = { + {.type = ACPI_TYPE_INTEGER,}, + {.type = ACPI_TYPE_INTEGER,}, + {.type = ACPI_TYPE_INTEGER,}, + {.type = ACPI_TYPE_INTEGER,}, + {.type = ACPI_TYPE_INTEGER,}, + {.type = ACPI_TYPE_INTEGER,}, + {.type = ACPI_TYPE_INTEGER,}, + }; + struct acpi_object_list arg_list = { PMC_IPCS_PARAM_COUNT, params }; + union acpi_object *obj; + int status; + + if (!ipc_cmd || !rbuf) + return -EINVAL; + + /* + * 0: IPC Command + * 1: IPC Sub Command + * 2: Size + * 3-6: Write Buffer for offset + */ + params[0].integer.value = ipc_cmd->cmd; + params[1].integer.value = ipc_cmd->sub_cmd; + params[2].integer.value = ipc_cmd->size; + params[3].integer.value = ipc_cmd->wbuf[0]; + params[4].integer.value = ipc_cmd->wbuf[1]; + params[5].integer.value = ipc_cmd->wbuf[2]; + params[6].integer.value = ipc_cmd->wbuf[3]; + + status = acpi_evaluate_object(NULL, "\\IPCS", &arg_list, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; + + obj = buffer.pointer; + + if (obj && obj->type == ACPI_TYPE_PACKAGE && + obj->package.count == VALID_IPC_RESPONSE) { + const union acpi_object *objs = obj->package.elements; + + if ((u8)objs[0].integer.value != 0) + return -EINVAL; + + rbuf->buf[0] = objs[1].integer.value; + rbuf->buf[1] = objs[2].integer.value; + rbuf->buf[2] = objs[3].integer.value; + rbuf->buf[3] = objs[4].integer.value; + } else { + return -EINVAL; + } + + return 0; +} + +#endif /* INTEL_PMC_IPC_H */ -- cgit v1.2.3 From e654cfc718d451bdf6c1554efc945171239f03f5 Mon Sep 17 00:00:00 2001 From: Choong Yong Liang Date: Thu, 27 Feb 2025 20:15:20 +0800 Subject: net: stmmac: configure SerDes on mac_finish The SerDes will be configured according to the provided interface mode after finishing a major reconfiguration of the interface mode. Reviewed-by: Russell King (Oracle) Signed-off-by: Choong Yong Liang Link: https://patch.msgid.link/20250227121522.1802832-5-yong.liang.choong@linux.intel.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index cd0d1383df87..b6f03ab12595 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -239,6 +239,10 @@ struct plat_stmmacenet_data { int (*serdes_powerup)(struct net_device *ndev, void *priv); void (*serdes_powerdown)(struct net_device *ndev, void *priv); void (*speed_mode_2500)(struct net_device *ndev, void *priv); + int (*mac_finish)(struct net_device *ndev, + void *priv, + unsigned int mode, + phy_interface_t interface); void (*ptp_clk_freq_config)(struct stmmac_priv *priv); int (*init)(struct platform_device *pdev, void *priv); void (*exit)(struct platform_device *pdev, void *priv); -- cgit v1.2.3 From e7f984e925d23f8fd0469e344a8dc225e1a1b0ab Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 3 Mar 2025 21:18:46 +0100 Subject: net: phy: move PHY package related code from phy.h to phy_package.c Move PHY package related inline functions from phy.h to phy_package.c. While doing so, remove the locked versions phy_package_read() and phy_package_write(), which have no users.
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/a4518379-7a5d-45f3-831c-b7fde6512c65@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 86 ----------------------------------------------------- 1 file changed, 86 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 7bfbae51070a..2b12d1bef3cc 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -350,10 +350,6 @@ struct phy_package_shared { void *priv; }; -/* used as bit number in atomic bitops */ -#define PHY_SHARED_F_INIT_DONE 0 -#define PHY_SHARED_F_PROBE_DONE 1 - /** * struct mii_bus - Represents an MDIO bus * @@ -2149,67 +2145,6 @@ int __phy_hwtstamp_set(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); -static inline int phy_package_address(struct phy_device *phydev, - unsigned int addr_offset) -{ - struct phy_package_shared *shared = phydev->shared; - u8 base_addr = shared->base_addr; - - if (addr_offset >= PHY_MAX_ADDR - base_addr) - return -EIO; - - /* we know that addr will be in the range 0..31 and thus the - * implicit cast to a signed int is not a problem. - */ - return base_addr + addr_offset; -} - -static inline int phy_package_read(struct phy_device *phydev, - unsigned int addr_offset, u32 regnum) -{ - int addr = phy_package_address(phydev, addr_offset); - - if (addr < 0) - return addr; - - return mdiobus_read(phydev->mdio.bus, addr, regnum); -} - -static inline int __phy_package_read(struct phy_device *phydev, - unsigned int addr_offset, u32 regnum) -{ - int addr = phy_package_address(phydev, addr_offset); - - if (addr < 0) - return addr; - - return __mdiobus_read(phydev->mdio.bus, addr, regnum); -} - -static inline int phy_package_write(struct phy_device *phydev, - unsigned int addr_offset, u32 regnum, - u16 val) -{ - int addr = phy_package_address(phydev, addr_offset); - - if (addr < 0) - return addr; - - return mdiobus_write(phydev->mdio.bus, addr, regnum, val); -} - -static inline int __phy_package_write(struct phy_device *phydev, - unsigned int addr_offset, u32 regnum, - u16 val) -{ - int addr = phy_package_address(phydev, addr_offset); - - if (addr < 0) - return addr; - - return __mdiobus_write(phydev->mdio.bus, addr, regnum, val); -} - int __phy_package_read_mmd(struct phy_device *phydev, unsigned int addr_offset, int devad, u32 regnum); @@ -2226,27 +2161,6 @@ int phy_package_write_mmd(struct phy_device *phydev, unsigned int addr_offset, int devad, u32 regnum, u16 val); -static inline bool __phy_package_set_once(struct phy_device *phydev, - unsigned int b) -{ - struct phy_package_shared *shared = phydev->shared; - - if (!shared) - return false; - - return !test_and_set_bit(b, &shared->flags); -} - -static inline bool phy_package_init_once(struct phy_device *phydev) -{ - return __phy_package_set_once(phydev, PHY_SHARED_F_INIT_DONE); -} - -static inline bool phy_package_probe_once(struct phy_device *phydev) -{ - return __phy_package_set_once(phydev, PHY_SHARED_F_PROBE_DONE); -} - extern const struct bus_type mdio_bus_type; struct mdio_board_info { -- cgit v1.2.3 From a4002849776948f257d564944e8fd7a727362ed9 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 3 Mar 2025 21:19:25 +0100 Subject: net: phy: remove remaining PHY package related definitions from phy.h Move definition of struct phy_package_shared to phy_package.c, and move remaining PHY package related declarations from phy.h to phylib.h, thus making them accessible for PHY drivers only. 
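For illustration (not part of this change, names invented): the
join/leave API whose declarations move to phylib.h is typically used
from a PHY driver's probe path roughly like this:

	static int myphy_probe(struct phy_device *phydev)
	{
		int ret;

		/* share one private struct across all PHYs of the package */
		ret = devm_phy_package_join(&phydev->mdio.dev, phydev,
					    MYPHY_BASE_ADDR,
					    sizeof(struct myphy_shared));
		if (ret)
			return ret;

		if (phy_package_init_once(phydev)) {
			/* package-wide one-time initialization goes here */
		}

		return 0;
	}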
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/211e14b6-e2f8-43d7-b533-3628ec548456@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 38 -------------------------------------- 1 file changed, 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 2b12d1bef3cc..c4a6385faf41 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -319,37 +319,6 @@ struct mdio_bus_stats { struct u64_stats_sync syncp; }; -/** - * struct phy_package_shared - Shared information in PHY packages - * @base_addr: Base PHY address of PHY package used to combine PHYs - * in one package and for offset calculation of phy_package_read/write - * @np: Pointer to the Device Node if PHY package defined in DT - * @refcnt: Number of PHYs connected to this shared data - * @flags: Initialization of PHY package - * @priv_size: Size of the shared private data @priv - * @priv: Driver private data shared across a PHY package - * - * Represents a shared structure between different phydev's in the same - * package, for example a quad PHY. See phy_package_join() and - * phy_package_leave(). - */ -struct phy_package_shared { - u8 base_addr; - /* With PHY package defined in DT this points to the PHY package node */ - struct device_node *np; - refcount_t refcnt; - unsigned long flags; - size_t priv_size; - - /* private data pointer */ - /* note that this pointer is shared between different phydevs and - * the user has to take care of appropriate locking. It is allocated - * and freed automatically by phy_package_join() and - * phy_package_leave(). - */ - void *priv; -}; - /** * struct mii_bus - Represents an MDIO bus * @@ -2109,13 +2078,6 @@ int phy_ethtool_get_link_ksettings(struct net_device *ndev, int phy_ethtool_set_link_ksettings(struct net_device *ndev, const struct ethtool_link_ksettings *cmd); int phy_ethtool_nway_reset(struct net_device *ndev); -int phy_package_join(struct phy_device *phydev, int base_addr, size_t priv_size); -int of_phy_package_join(struct phy_device *phydev, size_t priv_size); -void phy_package_leave(struct phy_device *phydev); -int devm_phy_package_join(struct device *dev, struct phy_device *phydev, - int base_addr, size_t priv_size); -int devm_of_phy_package_join(struct device *dev, struct phy_device *phydev, - size_t priv_size); int __init mdio_bus_init(void); void mdio_bus_exit(void); -- cgit v1.2.3 From 67bab13307c83fb742c2556b06cdc39dbad27f07 Mon Sep 17 00:00:00 2001 From: Ge Yang Date: Wed, 19 Feb 2025 11:46:44 +0800 Subject: mm/hugetlb: wait for hugetlb folios to be freed Since the introduction of commit c77c0a8ac4c52 ("mm/hugetlb: defer freeing of huge pages if in non-task context"), which supports deferring the freeing of hugetlb pages, the allocation of contiguous memory through cma_alloc() may fail probabilistically. In the CMA allocation process, if it is found that the CMA area is occupied by in-use hugetlb folios, these in-use hugetlb folios need to be migrated to another location. When there are no available hugetlb folios in the free hugetlb pool during the migration of in-use hugetlb folios, new folios are allocated from the buddy system. A temporary state is set on the newly allocated folio. Upon completion of the hugetlb folio migration, the temporary state is transferred from the new folios to the old folios. Normally, when the old folios with the temporary state are freed, it is directly released back to the buddy system. 
However, due to the deferred freeing of hugetlb pages, the PageBuddy()
check fails, ultimately leading to the failure of cma_alloc(). Here is a
simplified call trace illustrating the process:

cma_alloc()
    ->__alloc_contig_migrate_range() // Migrate in-use hugetlb folios
        ->unmap_and_move_huge_page()
            ->folio_putback_hugetlb() // Free old folios
    ->test_pages_isolated()
        ->__test_page_isolated_in_pageblock()
            ->PageBuddy(page) // Check if the page is in buddy

To resolve this issue, we have implemented a function named
wait_for_freed_hugetlb_folios(). This function ensures that the hugetlb
folios are properly released back to the buddy system after their
migration is completed. By invoking wait_for_freed_hugetlb_folios()
before calling PageBuddy(), we ensure that PageBuddy() will succeed.

Link: https://lkml.kernel.org/r/1739936804-18199-1-git-send-email-yangge1116@126.com
Fixes: c77c0a8ac4c5 ("mm/hugetlb: defer freeing of huge pages if in non-task context")
Signed-off-by: Ge Yang
Reviewed-by: Muchun Song
Acked-by: David Hildenbrand
Cc: Baolin Wang
Cc: Barry Song <21cnbao@gmail.com>
Cc: Oscar Salvador
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/hugetlb.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ec8c0ccc8f95..dbe76d4f1bfc 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -682,6 +682,7 @@ struct huge_bootmem_page {
 int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
 int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn);
+void wait_for_freed_hugetlb_folios(void);
 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 				unsigned long addr, bool cow_from_owner);
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
@@ -1066,6 +1067,10 @@ static inline int replace_free_hugepage_folios(unsigned long start_pfn,
 	return 0;
 }

+static inline void wait_for_freed_hugetlb_folios(void)
+{
+}
+
 static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 					   unsigned long addr,
 					   bool cow_from_owner)
-- cgit v1.2.3

From ce6d9c1c2b5cc785016faa11b48b6cd317eb367e Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Mon, 24 Feb 2025 21:20:02 -0500
Subject: NFS: fix nfs_release_folio() to not deadlock via kcompactd writeback

Add PF_KCOMPACTD flag and current_is_kcompactd() helper to check for it
so nfs_release_folio() can skip calling nfs_wb_folio() from kcompactd.

Otherwise NFS can deadlock waiting for kcompactd-induced writeback which
recurses back to NFS (which triggers writeback to NFSD via NFS loopback
mount on the same host, NFSD blocks waiting for XFS's call to
__filemap_get_folio):

[ 6070.550357] INFO: task kcompactd0:58 blocked for more than 4435 seconds.
{--- [58] "kcompactd0" [<0>] folio_wait_bit+0xe8/0x200 [<0>] folio_wait_writeback+0x2b/0x80 [<0>] nfs_wb_folio+0x80/0x1b0 [nfs] [<0>] nfs_release_folio+0x68/0x130 [nfs] [<0>] split_huge_page_to_list_to_order+0x362/0x840 [<0>] migrate_pages_batch+0x43d/0xb90 [<0>] migrate_pages_sync+0x9a/0x240 [<0>] migrate_pages+0x93c/0x9f0 [<0>] compact_zone+0x8e2/0x1030 [<0>] compact_node+0xdb/0x120 [<0>] kcompactd+0x121/0x2e0 [<0>] kthread+0xcf/0x100 [<0>] ret_from_fork+0x31/0x40 [<0>] ret_from_fork_asm+0x1a/0x30 ---} [akpm@linux-foundation.org: fix build] Link: https://lkml.kernel.org/r/20250225022002.26141-1-snitzer@kernel.org Fixes: 96780ca55e3c ("NFS: fix up nfs_release_folio() to try to release the page") Signed-off-by: Mike Snitzer Cc: Anna Schumaker Cc: Trond Myklebust Cc: Signed-off-by: Andrew Morton --- include/linux/compaction.h | 5 +++++ include/linux/sched.h | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index e94776496049..7bf0c521db63 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -80,6 +80,11 @@ static inline unsigned long compact_gap(unsigned int order) return 2UL << order; } +static inline int current_is_kcompactd(void) +{ + return current->flags & PF_KCOMPACTD; +} + #ifdef CONFIG_COMPACTION extern unsigned int extfrag_for_order(struct zone *zone, unsigned int order); diff --git a/include/linux/sched.h b/include/linux/sched.h index 9632e3318e0d..9c15365a30c0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1701,7 +1701,7 @@ extern struct pid *cad_pid; #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ -#define PF__HOLE__00010000 0x00010000 +#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ -- cgit v1.2.3 From c29564d8b46f64f5e6e6f1c9c02f7761b7b90963 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Fri, 21 Feb 2025 15:16:25 +0800 Subject: include/linux/log2.h: mark is_power_of_2() with __always_inline When building kernel with randconfig, there is an error: In function `kvm_is_cr4_bit_set',inlined from `kvm_update_cpuid_runtime' at arch/x86/kvm/cpuid.c:310:9: include/linux/compiler_types.h:542:38: error: call to `__compiletime_assert_380' declared with attribute error: BUILD_BUG_ON failed: !is_power_of_2(cr4_bit). '!is_power_of_2(X86_CR4_OSXSAVE)' is False, but gcc treats is_power_of_2() as non-inline function and a compilation error happens. Fix this by marking is_power_of_2() with __always_inline. Link: https://lkml.kernel.org/r/20250221071624.1356899-1-suhui@nfschina.com Signed-off-by: Su Hui Cc: Binbin Wu Cc: Paolo Bonzini Cc: Sean Christopherson Signed-off-by: Andrew Morton --- include/linux/log2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/log2.h b/include/linux/log2.h index 9f30d087a128..1366cb688a6d 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -41,7 +41,7 @@ int __ilog2_u64(u64 n) * *not* considered a power of two. * Return: true if @n is a power of 2, otherwise false. 
*/ -static inline __attribute__((const)) +static __always_inline __attribute__((const)) bool is_power_of_2(unsigned long n) { return (n != 0 && ((n & (n - 1)) == 0)); -- cgit v1.2.3 From 9b443b68d97983dfb9a92009a5c951364fa35985 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 5 Mar 2025 10:49:39 +0100 Subject: gpiolib: fix kerneldoc Add missing '@' to the kernel doc for the new of_node_instance_match field of struct gpio_chip. Fixes: bd3ce71078bd ("gpiolib: of: Handle threecell GPIO chips") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/all/20250305203929.70283b9b@canb.auug.org.au/ Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250305094939.40011-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index e6e5304c99ca..4c0294a9104d 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -540,7 +540,7 @@ struct gpio_chip { unsigned int of_gpio_n_cells; /** - * of_node_instance_match: + * @of_node_instance_match: * * Determine if a chip is the right instance. Must be implemented by * any driver using more than one gpio_chip per device tree node. -- cgit v1.2.3 From 4fbfa17f9a075593281034f566ca79cbf4930c82 Mon Sep 17 00:00:00 2001 From: Shradha Todi Date: Fri, 21 Feb 2025 18:45:46 +0530 Subject: PCI: dwc: Add debugfs based Silicon Debug support for DWC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support to provide Silicon Debug interface to userspace. This set of debug registers are part of the RAS DES feature present in DesignWare PCIe controllers. Co-developed-by: Manivannan Sadhasivam Signed-off-by: Shradha Todi Reviewed-by: Manivannan Sadhasivam Reviewed-by: Fan Ni Tested-by: Hrishikesh Deleep Link: https://lore.kernel.org/r/20250221131548.59616-4-shradha.t@samsung.com [kwilczynski: commit log, tidy up Kconfig and drop "default y", tidy up code comments, squashed patch that fixes a NULL pointer dereference when debugfs is already unavailable during clean-up from https://lore.kernel.org/linux-pci/20250225171239.19574-2-manivannan.sadhasivam@linaro.org, refactor dwc_pcie_debugfs_init() to not return errors, squashed patch that changes how lack of the RAS DES capability is handled from https://lore.kernel.org/linux-pci/20250304151814.6xu7cbpwpqrvcad5@thinkpad] Signed-off-by: Krzysztof Wilczyński --- include/linux/pcie-dwc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pcie-dwc.h b/include/linux/pcie-dwc.h index 6e4e1307ad6e..007b3f1b7b17 100644 --- a/include/linux/pcie-dwc.h +++ b/include/linux/pcie-dwc.h @@ -28,6 +28,8 @@ static const struct dwc_pcie_vsec_id dwc_pcie_rasdes_vsec_ids[] = { .vsec_id = 0x02, .vsec_rev = 0x4 }, { .vendor_id = PCI_VENDOR_ID_QCOM, .vsec_id = 0x02, .vsec_rev = 0x4 }, + { .vendor_id = PCI_VENDOR_ID_SAMSUNG, + .vsec_id = 0x02, .vsec_rev = 0x4 }, {} }; -- cgit v1.2.3 From 20bbb083bbc9d3f8db390f2e35e168f1b23dae8a Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Tue, 18 Feb 2025 10:21:21 +0100 Subject: PCI: Add Rockchip Vendor ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move PCI_VENDOR_ID_ROCKCHIP from pci_endpoint_test.c to pci_ids.h and reuse it in pcie-rockchip-host.c. 
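For illustration (not part of this change): a driver typically consumes
such a vendor ID in its PCI device table; the device ID below is a
made-up placeholder:

	static const struct pci_device_id myrk_pcie_ids[] = {
		{ PCI_DEVICE(PCI_VENDOR_ID_ROCKCHIP, 0x0100) },
		{ }
	};
	MODULE_DEVICE_TABLE(pci, myrk_pcie_ids);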
Link: https://lore.kernel.org/r/20250218092120.2322784-2-cassel@kernel.org Signed-off-by: Shawn Lin Signed-off-by: Niklas Cassel Signed-off-by: Bjorn Helgaas Signed-off-by: Krzysztof Wilczyński --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index de5deb1a0118..e1a270e7e0c5 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2609,6 +2609,8 @@ #define PCI_VENDOR_ID_ZHAOXIN 0x1d17 +#define PCI_VENDOR_ID_ROCKCHIP 0x1d87 + #define PCI_VENDOR_ID_HYGON 0x1d94 #define PCI_VENDOR_ID_META 0x1d9b -- cgit v1.2.3 From 046e9b981d75df6b4ce8c42136e89cda8b6e9c14 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 25 Feb 2025 15:56:58 +0100 Subject: PCI: dwc: Add Rockchip to the RAS DES allowed vendor list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add PCI_VENDOR_ID_ROCKCHIP to the list of RAS DES vendor specific IDs. Signed-off-by: Niklas Cassel Link: https://lore.kernel.org/r/20250225145657.944925-2-cassel@kernel.org [kwilczynski: commit log] Signed-off-by: Krzysztof Wilczyński --- include/linux/pcie-dwc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pcie-dwc.h b/include/linux/pcie-dwc.h index 007b3f1b7b17..8ff778e7aec0 100644 --- a/include/linux/pcie-dwc.h +++ b/include/linux/pcie-dwc.h @@ -28,6 +28,8 @@ static const struct dwc_pcie_vsec_id dwc_pcie_rasdes_vsec_ids[] = { .vsec_id = 0x02, .vsec_rev = 0x4 }, { .vendor_id = PCI_VENDOR_ID_QCOM, .vsec_id = 0x02, .vsec_rev = 0x4 }, + { .vendor_id = PCI_VENDOR_ID_ROCKCHIP, + .vsec_id = 0x02, .vsec_rev = 0x4 }, { .vendor_id = PCI_VENDOR_ID_SAMSUNG, .vsec_id = 0x02, .vsec_rev = 0x4 }, {} -- cgit v1.2.3 From b4de0e9be963b95c46c4a5426e94059923d236d6 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 3 Mar 2025 17:11:10 +0000 Subject: iomap: Rename IOMAP_ATOMIC -> IOMAP_ATOMIC_HW In future xfs will support a SW-based atomic write, so rename IOMAP_ATOMIC -> IOMAP_ATOMIC_HW to be clear which mode is being used. Also relocate setting of IOMAP_ATOMIC_HW to the write path in __iomap_dio_rw(), to be clear that this flag is only relevant to writes. Reviewed-by: "Darrick J. Wong" Signed-off-by: John Garry Link: https://lore.kernel.org/r/20250303171120.2837067-3-john.g.garry@oracle.com Signed-off-by: Christian Brauner --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index ea29388b2fba..87cd7079aaf3 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -189,7 +189,7 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ -#define IOMAP_ATOMIC (1 << 9) +#define IOMAP_ATOMIC_HW (1 << 9) #define IOMAP_DONTCACHE (1 << 10) struct iomap_ops { -- cgit v1.2.3 From 794ca29dcc924cd3f16d12b6fba61074c992b8fd Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 3 Mar 2025 17:11:13 +0000 Subject: iomap: Support SW-based atomic writes Currently atomic write support requires dedicated HW support. This imposes a restriction on the filesystem that disk blocks need to be aligned and contiguously mapped to FS blocks to issue atomic writes. XFS has no method to guarantee FS block alignment for regular, non-RT files. As such, atomic writes are currently limited to 1x FS block there. 
To deal with the scenario where an atomic write is issued over misaligned
or discontiguous data blocks - and to raise the atomic write size limit -
support a software-emulated atomic write mode.

For XFS, these SW-based atomic writes would use CoW support to issue
emulated untorn writes.

It is the responsibility of the FS to detect discontiguous atomic writes
and switch to IOMAP_DIO_ATOMIC_SW mode and retry the write. Indeed,
SW-based atomic writes could always be used when the mounted bdev does
not support HW offload, but this strategy is not initially expected to be
used.

Reviewed-by: "Darrick J. Wong"
Signed-off-by: John Garry
Link: https://lore.kernel.org/r/20250303171120.2837067-6-john.g.garry@oracle.com
Signed-off-by: Christian Brauner
---
 include/linux/iomap.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 87cd7079aaf3..9cd93530013c 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -189,8 +189,9 @@ struct iomap_folio_ops {
 #else
 #define IOMAP_DAX		0
 #endif /* CONFIG_FS_DAX */
-#define IOMAP_ATOMIC_HW	(1 << 9)
+#define IOMAP_ATOMIC_HW	(1 << 9) /* HW-based torn-write protection */
 #define IOMAP_DONTCACHE	(1 << 10)
+#define IOMAP_ATOMIC_SW	(1 << 11) /* SW-based torn-write protection */

 struct iomap_ops {
 	/*
@@ -502,6 +503,11 @@ struct iomap_dio_ops {
  */
 #define IOMAP_DIO_PARTIAL	(1 << 2)

+/*
+ * Use software-based torn-write protection.
+ */
+#define IOMAP_DIO_ATOMIC_SW	(1 << 3)
+
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
 		unsigned int dio_flags, void *private, size_t done_before);
-- cgit v1.2.3

From d6834d9c990333bfa433bc1816e2417f268eebbe Mon Sep 17 00:00:00 2001
From: Li Huafei
Date: Tue, 22 Oct 2024 03:30:03 +0800
Subject: watchdog/hardlockup/perf: Fix perf_event memory leak

During stress-testing, we found a kmemleak report for perf_event:

unreferenced object 0xff110001410a33e0 (size 1328):
  comm "kworker/4:11", pid 288, jiffies 4294916004
  hex dump (first 32 bytes):
    b8 be c2 3b 02 00 11 ff 22 01 00 00 00 00 ad de  ...;....".......
    f0 33 0a 41 01 00 11 ff f0 33 0a 41 01 00 11 ff  .3.A.....3.A....
  backtrace (crc 24eb7b3a):
    [<00000000e211b653>] kmem_cache_alloc_node_noprof+0x269/0x2e0
    [<000000009d0985fa>] perf_event_alloc+0x5f/0xcf0
    [<00000000084ad4a2>] perf_event_create_kernel_counter+0x38/0x1b0
    [<00000000fde96401>] hardlockup_detector_event_create+0x50/0xe0
    [<0000000051183158>] watchdog_hardlockup_enable+0x17/0x70
    [<00000000ac89727f>] softlockup_start_fn+0x15/0x40
    ...

Our stress test includes CPU online and offline cycles, and updating the
watchdog configuration.

After reading the code, I found that there may be a race between cleaning
up perf_event after updating the watchdog and disabling the event when
the CPU goes offline:

CPU0                          CPU1                      CPU2
(update watchdog)             (hotplug offline CPU1)

...                           _cpu_down(CPU1)
cpus_read_lock()              // waiting for cpu lock
  softlockup_start_all
    smp_call_on_cpu(CPU1)
      softlockup_start_fn
      ...
        watchdog_hardlockup_enable(CPU1)
          perf create E1
          watchdog_ev[CPU1] = E1
cpus_read_unlock()
                              cpus_write_lock()
                              cpuhp_kick_ap_work(CPU1)
                                cpuhp_thread_fun
                                ...
watchdog_hardlockup_disable(CPU1) watchdog_ev[CPU1] = NULL dead_event[CPU1] = E1 __lockup_detector_cleanup for each dead_events_mask release each dead_event /* * CPU1 has not been added to * dead_events_mask, then E1 * will not be released */ CPU1 -> dead_events_mask cpumask_clear(&dead_events_mask) // dead_events_mask is cleared, E1 is leaked In this case, the leaked perf_event E1 matches the perf_event leak reported by kmemleak. Due to the low probability of problem recurrence (only reported once), I added some hack delays in the code: static void __lockup_detector_reconfigure(void) { ... watchdog_hardlockup_start(); cpus_read_unlock(); + mdelay(100); /* * Must be called outside the cpus locked section to prevent * recursive locking in the perf code. ... } void watchdog_hardlockup_disable(unsigned int cpu) { ... perf_event_disable(event); this_cpu_write(watchdog_ev, NULL); this_cpu_write(dead_event, event); + mdelay(100); cpumask_set_cpu(smp_processor_id(), &dead_events_mask); atomic_dec(&watchdog_cpus); ... } void hardlockup_detector_perf_cleanup(void) { ... perf_event_release_kernel(event); per_cpu(dead_event, cpu) = NULL; } + mdelay(100); cpumask_clear(&dead_events_mask); } Then, simultaneously performing CPU on/off and switching watchdog, it is almost certain to reproduce this leak. The problem here is that releasing perf_event is not within the CPU hotplug read-write lock. Commit: 941154bd6937 ("watchdog/hardlockup/perf: Prevent CPU hotplug deadlock") introduced deferred release to solve the deadlock caused by calling get_online_cpus() when releasing perf_event. Later, commit: efe951d3de91 ("perf/x86: Fix perf,x86,cpuhp deadlock") removed the get_online_cpus() call on the perf_event release path to solve another deadlock problem. Therefore, it is now possible to move the release of perf_event back into the CPU hotplug read-write lock, and release the event immediately after disabling it. 
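In effect, the disable path can then take roughly the following shape
(simplified sketch for illustration, not the literal patch):

	void watchdog_hardlockup_disable(unsigned int cpu)
	{
		struct perf_event *event = this_cpu_read(watchdog_ev);

		if (event) {
			perf_event_disable(event);
			/* safe here since the hotplug lock is held and the
			 * perf release path no longer takes it
			 */
			perf_event_release_kernel(event);
			this_cpu_write(watchdog_ev, NULL);
			atomic_dec(&watchdog_cpus);
		}
	}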
Fixes: 941154bd6937 ("watchdog/hardlockup/perf: Prevent CPU hotplug deadlock") Signed-off-by: Li Huafei Signed-off-by: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20241021193004.308303-1-lihuafei1@huawei.com --- include/linux/nmi.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a8dfb38c9bb6..e78fa535f61d 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -17,7 +17,6 @@ void lockup_detector_init(void); void lockup_detector_retry_init(void); void lockup_detector_soft_poweroff(void); -void lockup_detector_cleanup(void); extern int watchdog_user_enabled; extern int watchdog_thresh; @@ -37,7 +36,6 @@ extern int sysctl_hardlockup_all_cpu_backtrace; static inline void lockup_detector_init(void) { } static inline void lockup_detector_retry_init(void) { } static inline void lockup_detector_soft_poweroff(void) { } -static inline void lockup_detector_cleanup(void) { } #endif /* !CONFIG_LOCKUP_DETECTOR */ #ifdef CONFIG_SOFTLOCKUP_DETECTOR @@ -104,12 +102,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs); #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) extern void hardlockup_detector_perf_stop(void); extern void hardlockup_detector_perf_restart(void); -extern void hardlockup_detector_perf_cleanup(void); extern void hardlockup_config_perf_event(const char *str); #else static inline void hardlockup_detector_perf_stop(void) { } static inline void hardlockup_detector_perf_restart(void) { } -static inline void hardlockup_detector_perf_cleanup(void) { } static inline void hardlockup_config_perf_event(const char *str) { } #endif -- cgit v1.2.3 From fa6192adc32f4fdfe5b74edd5b210e12afd6ecc0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 12 Feb 2025 23:04:33 +0100 Subject: uprobes/x86: Harden uretprobe syscall trampoline check Jann reported a possible issue when trampoline_check_ip returns address near the bottom of the address space that is allowed to call into the syscall if uretprobes are not set up: https://lore.kernel.org/bpf/202502081235.5A6F352985@keescook/T/#m9d416df341b8fbc11737dacbcd29f0054413cbbf Though the mmap minimum address restrictions will typically prevent creating mappings there, let's make sure uretprobe syscall checks for that. Fixes: ff474a78cef5 ("uprobe: Add uretprobe syscall to speed up return probe") Reported-by: Jann Horn Signed-off-by: Jiri Olsa Signed-off-by: Ingo Molnar Reviewed-by: Oleg Nesterov Reviewed-by: Kees Cook Acked-by: Andrii Nakryiko Acked-by: Masami Hiramatsu (Google) Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250212220433.3624297-1-jolsa@kernel.org --- include/linux/uprobes.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index a40efdda9052..2e46b69ff0a6 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -39,6 +39,8 @@ struct page; #define MAX_URETPROBE_DEPTH 64 +#define UPROBE_NO_TRAMPOLINE_VADDR (~0UL) + struct uprobe_consumer { /* * handler() can return UPROBE_HANDLER_REMOVE to signal the need to -- cgit v1.2.3 From c8775aefba959cdfbaa25408a84d3dd15bbeb991 Mon Sep 17 00:00:00 2001 From: Zheng Qixing Date: Thu, 27 Feb 2025 15:55:05 +0800 Subject: badblocks: return boolean from badblocks_set() and badblocks_clear() Change the return type of badblocks_set() and badblocks_clear() from int to bool, indicating success or failure. 
Specifically:

- _badblocks_set() and _badblocks_clear() functions now return true for
  success and false for failure.
- All calls to these functions are updated to handle the new boolean
  return type.
- This change improves code clarity and ensures a more consistent
  handling of success and failure states.

Signed-off-by: Zheng Qixing
Reviewed-by: Yu Kuai
Acked-by: Coly Li
Acked-by: Ira Weiny
Link: https://lore.kernel.org/r/20250227075507.151331-11-zhengqixing@huaweicloud.com
Signed-off-by: Jens Axboe
---
 include/linux/badblocks.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h
index 670f2dae692f..8764bed9ff16 100644
--- a/include/linux/badblocks.h
+++ b/include/linux/badblocks.h
@@ -50,9 +50,9 @@ struct badblocks_context {

 int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
 		    sector_t *first_bad, int *bad_sectors);
-int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
-		  int acknowledged);
-int badblocks_clear(struct badblocks *bb, sector_t s, int sectors);
+bool badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+		   int acknowledged);
+bool badblocks_clear(struct badblocks *bb, sector_t s, int sectors);
 void ack_all_badblocks(struct badblocks *bb);
 ssize_t badblocks_show(struct badblocks *bb, char *page, int unack);
 ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
-- cgit v1.2.3

From d301f164c3fbff611bd71f57dfa553b9219f0f5e Mon Sep 17 00:00:00 2001
From: Zheng Qixing
Date: Thu, 27 Feb 2025 15:55:07 +0800
Subject: badblocks: use sector_t instead of int to avoid truncation of badblocks length

There is a truncation of badblocks length issue when setting badblocks
as follows:

echo "2055 4294967299" > bad_blocks
cat bad_blocks
2055 3

Change 'sectors' argument type from 'int' to 'sector_t'. This change
avoids truncation of badblocks length for large sectors by replacing
'int' with 'sector_t' (u64), enabling proper handling of larger disk
sizes and ensuring compatibility with 64-bit sector addressing.
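The arithmetic behind the observed "2055 3" output: 4294967299 is
2^32 + 3, so storing it in a 32-bit 'int' keeps only the low 32 bits. A
minimal stand-alone illustration (userspace C, not kernel code):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t sectors = 4294967299ULL;	/* 2^32 + 3 */
		int truncated = (int)sectors;		/* the old 'int' parameter */

		printf("%d\n", truncated);		/* prints 3 */
		return 0;
	}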
Fixes: 9e0e252a048b ("badblocks: Add core badblock management code") Signed-off-by: Zheng Qixing Reviewed-by: Yu Kuai Acked-by: Coly Li Link: https://lore.kernel.org/r/20250227075507.151331-13-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- include/linux/badblocks.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/badblocks.h b/include/linux/badblocks.h index 8764bed9ff16..996493917f36 100644 --- a/include/linux/badblocks.h +++ b/include/linux/badblocks.h @@ -48,11 +48,11 @@ struct badblocks_context { int ack; }; -int badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors); -bool badblocks_set(struct badblocks *bb, sector_t s, int sectors, +int badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors); +bool badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors, int acknowledged); -bool badblocks_clear(struct badblocks *bb, sector_t s, int sectors); +bool badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors); void ack_all_badblocks(struct badblocks *bb); ssize_t badblocks_show(struct badblocks *bb, char *page, int unack); ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, -- cgit v1.2.3 From 74d42bdb3a4673b1c10d1f457184e4d3c9cb0196 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 6 Mar 2025 07:30:42 -1000 Subject: fs/pipe: express 'pipe_empty()' in terms of 'pipe_occupancy()' That's what 'pipe_full()' does, so it's more consistent. But more importantly it gets the type limits right when the pipe head and tail are no longer necessarily 'unsigned int'. Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index e572e6fc4f81..4d0a2267e6ef 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -177,23 +177,23 @@ static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe) } /** - * pipe_empty - Return true if the pipe is empty + * pipe_occupancy - Return number of slots used in the pipe * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer */ -static inline bool pipe_empty(unsigned int head, unsigned int tail) +static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail) { - return head == tail; + return (pipe_index_t)(head - tail); } /** - * pipe_occupancy - Return number of slots used in the pipe + * pipe_empty - Return true if the pipe is empty * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer */ -static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail) +static inline bool pipe_empty(unsigned int head, unsigned int tail) { - return (pipe_index_t)(head - tail); + return !pipe_occupancy(head, tail); } /** -- cgit v1.2.3 From 35b98180ec989db5dfdd73c49a15b5047f243edb Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Thu, 13 Feb 2025 11:39:51 +0000 Subject: tracing: Remove orphaned event_trace_printk The event_trace_printk macro has no callers since commit b8e65554d80b ("tracing: remove deprecated TRACE_FORMAT"). So drop it. 
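(For reference: the plain trace_printk() helper, which rests on the same
__trace_printk()/__trace_bprintk() machinery the removed macro wrapped,
remains the supported way to get printf-style output into the trace
buffer, e.g.:)

	trace_printk("processed %d entries in %llu ns\n", count, elapsed_ns);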
Link: https://lore.kernel.org/20250213113951.813258-1-hengqi.chen@gmail.com Signed-off-by: Hengqi Chen Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 5caea596fef0..fa9cf4292dff 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -859,24 +859,6 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); int trace_set_clr_event(const char *system, const char *event, int set); int trace_array_set_clr_event(struct trace_array *tr, const char *system, const char *event, bool enable); -/* - * The double __builtin_constant_p is because gcc will give us an error - * if we try to allocate the static variable to fmt if it is not a - * constant. Even with the outer if statement optimizing out. - */ -#define event_trace_printk(ip, fmt, args...) \ -do { \ - __trace_printk_check_format(fmt, ##args); \ - tracing_record_cmdline(current); \ - if (__builtin_constant_p(fmt)) { \ - static const char *trace_printk_fmt \ - __section("__trace_printk_fmt") = \ - __builtin_constant_p(fmt) ? fmt : NULL; \ - \ - __trace_bprintk(ip, trace_printk_fmt, ##args); \ - } else \ - __trace_printk(ip, fmt, ##args); \ -} while (0) #ifdef CONFIG_PERF_EVENTS struct perf_event; -- cgit v1.2.3 From 2e4986cf2d525eed3a240b7821f89ca45cf36d78 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 27 Feb 2025 20:26:29 -0400 Subject: fwctl: Add basic structure for a class subsystem with a cdev Create the class, character device and functions for a fwctl driver to un/register to the subsystem. A typical fwctl driver has a sysfs presence like: $ ls -l /dev/fwctl/fwctl0 crw------- 1 root root 250, 0 Apr 25 19:16 /dev/fwctl/fwctl0 $ ls /sys/class/fwctl/fwctl0 dev device power subsystem uevent $ ls /sys/class/fwctl/fwctl0/device/infiniband/ ibp0s10f0 $ ls /sys/class/infiniband/ibp0s10f0/device/fwctl/ fwctl0/ $ ls /sys/devices/pci0000:00/0000:00:0a.0/fwctl/fwctl0 dev device power subsystem uevent Which allows userspace to link all the multi-subsystem driver components together and learn the subsystem specific names for the device's components. Link: https://patch.msgid.link/r/1-v5-642aa0c94070+4447f-fwctl_jgg@nvidia.com Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Reviewed-by: Dave Jiang Reviewed-by: Shannon Nelson Tested-by: Dave Jiang Tested-by: Shannon Nelson Signed-off-by: Jason Gunthorpe --- include/linux/fwctl.h | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 include/linux/fwctl.h (limited to 'include/linux') diff --git a/include/linux/fwctl.h b/include/linux/fwctl.h new file mode 100644 index 000000000000..39d5059c9e59 --- /dev/null +++ b/include/linux/fwctl.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __LINUX_FWCTL_H +#define __LINUX_FWCTL_H +#include +#include +#include + +struct fwctl_device; +struct fwctl_uctx; + +struct fwctl_ops { +}; + +/** + * struct fwctl_device - Per-driver registration struct + * @dev: The sysfs (class/fwctl/fwctlXX) device + * + * Each driver instance will have one of these structs with the driver private + * data following immediately after. This struct is refcounted, it is freed by + * calling fwctl_put(). 
+ */
+struct fwctl_device {
+	struct device dev;
+	/* private: */
+	struct cdev cdev;
+	const struct fwctl_ops *ops;
+};
+
+struct fwctl_device *_fwctl_alloc_device(struct device *parent,
+					 const struct fwctl_ops *ops,
+					 size_t size);
+/**
+ * fwctl_alloc_device - Allocate a fwctl
+ * @parent: Physical device that provides the FW interface
+ * @ops: Driver ops to register
+ * @drv_struct: 'struct driver_fwctl' that holds the struct fwctl_device
+ * @member: Name of the struct fwctl_device in @drv_struct
+ *
+ * This allocates and initializes the fwctl_device embedded in the drv_struct.
+ * Upon success the pointer must be freed via fwctl_put(). Returns a 'drv_struct
+ * \*' on success, NULL on error.
+ */
+#define fwctl_alloc_device(parent, ops, drv_struct, member)               \
+	({                                                                \
+		static_assert(__same_type(struct fwctl_device,            \
+					  ((drv_struct *)NULL)->member)); \
+		static_assert(offsetof(drv_struct, member) == 0);         \
+		(drv_struct *)_fwctl_alloc_device(parent, ops,            \
+						  sizeof(drv_struct));    \
+	})
+
+static inline struct fwctl_device *fwctl_get(struct fwctl_device *fwctl)
+{
+	get_device(&fwctl->dev);
+	return fwctl;
+}
+static inline void fwctl_put(struct fwctl_device *fwctl)
+{
+	put_device(&fwctl->dev);
+}
+DEFINE_FREE(fwctl, struct fwctl_device *, if (_T) fwctl_put(_T));
+
+int fwctl_register(struct fwctl_device *fwctl);
+void fwctl_unregister(struct fwctl_device *fwctl);
+
+#endif
-- cgit v1.2.3

From 0e79a47fb197b6937709a2af2a138c526a9bc374 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Thu, 27 Feb 2025 20:26:30 -0400
Subject: fwctl: Basic ioctl dispatch for the character device

Each file descriptor gets a chunk of per-FD driver specific context that
allows the driver to attach a device specific struct to. The core code
takes care of the memory lifetime for this structure.

The ioctl dispatch and design are based on what was built for iommufd.
The ioctls have a struct which has a combined in/out behavior with a
typical 'zero pad' scheme for future extension and backwards
compatibility. Like iommufd, some shared logic does most of the ioctl
marshaling and compatibility work and table dispatches to some function
pointers for each unique ioctl.

This approach has proven to work quite well in the iommufd and rdma
subsystems.

Allocate an ioctl number space for the subsystem.

Link: https://patch.msgid.link/r/2-v5-642aa0c94070+4447f-fwctl_jgg@nvidia.com
Reviewed-by: Jonathan Cameron
Reviewed-by: Dave Jiang
Reviewed-by: Shannon Nelson
Tested-by: Dave Jiang
Tested-by: Shannon Nelson
Signed-off-by: Jason Gunthorpe
---
 include/linux/fwctl.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fwctl.h b/include/linux/fwctl.h
index 39d5059c9e59..faa4b2c780e0 100644
--- a/include/linux/fwctl.h
+++ b/include/linux/fwctl.h
@@ -11,7 +11,30 @@
 struct fwctl_device;
 struct fwctl_uctx;

+/**
+ * struct fwctl_ops - Driver provided operations
+ *
+ * fwctl_unregister() will wait until all executing ops are completed before it
+ * returns. Drivers should be mindful to not let their ops run for too long as
+ * it will block device hot unplug and module unloading.
+ */
 struct fwctl_ops {
+	/**
+	 * @uctx_size: The size of the fwctl_uctx struct to allocate. The first
+	 * bytes of this memory will be a fwctl_uctx. The driver can use the
+	 * remaining bytes as its private memory.
+	 */
+	size_t uctx_size;
+	/**
+	 * @open_uctx: Called when a file descriptor is opened before the uctx
+	 * is ever used.
+ */ + int (*open_uctx)(struct fwctl_uctx *uctx); + /** + * @close_uctx: Called when the uctx is destroyed, usually when the FD + * is closed. + */ + void (*close_uctx)(struct fwctl_uctx *uctx); }; /** @@ -26,6 +49,15 @@ struct fwctl_device { struct device dev; /* private: */ struct cdev cdev; + + /* Protect uctx_list */ + struct mutex uctx_list_lock; + struct list_head uctx_list; + /* + * Protect ops, held for write when ops becomes NULL during unregister, + * held for read whenever ops is loaded or an ops function is running. + */ + struct rw_semaphore registration_lock; const struct fwctl_ops *ops; }; @@ -66,4 +98,18 @@ DEFINE_FREE(fwctl, struct fwctl_device *, if (_T) fwctl_put(_T)); int fwctl_register(struct fwctl_device *fwctl); void fwctl_unregister(struct fwctl_device *fwctl); +/** + * struct fwctl_uctx - Per user FD context + * @fwctl: fwctl instance that owns the context + * + * Every FD opened by userspace will get a unique context allocation. Any driver + * private data will follow immediately after. + */ +struct fwctl_uctx { + struct fwctl_device *fwctl; + /* private: */ + /* Head at fwctl_device::uctx_list */ + struct list_head uctx_list_entry; +}; + #endif -- cgit v1.2.3 From fb39e9092be5a18eaab05b5a2492741fe6e395fe Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 27 Feb 2025 20:26:31 -0400 Subject: fwctl: FWCTL_INFO to return basic information about the device Userspace will need to know some details about the fwctl interface being used to locate the correct userspace code to communicate with the kernel. Provide a simple device_type enum indicating what the kernel driver is. Allow the device to provide a device specific info struct that contains any additional information that the driver may need to provide to userspace. Link: https://patch.msgid.link/r/3-v5-642aa0c94070+4447f-fwctl_jgg@nvidia.com Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Shannon Nelson Tested-by: Dave Jiang Tested-by: Shannon Nelson Signed-off-by: Jason Gunthorpe --- include/linux/fwctl.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fwctl.h b/include/linux/fwctl.h index faa4b2c780e0..700a5be940e3 100644 --- a/include/linux/fwctl.h +++ b/include/linux/fwctl.h @@ -7,6 +7,7 @@ #include #include #include +#include struct fwctl_device; struct fwctl_uctx; @@ -19,6 +20,10 @@ struct fwctl_uctx; * it will block device hot unplug and module unloading. */ struct fwctl_ops { + /** + * @device_type: The drivers assigned device_type number. This is uABI. + */ + enum fwctl_device_type device_type; /** * @uctx_size: The size of the fwctl_uctx struct to allocate. The first * bytes of this memory will be a fwctl_uctx. The driver can use the @@ -35,6 +40,13 @@ struct fwctl_ops { * is closed. */ void (*close_uctx)(struct fwctl_uctx *uctx); + /** + * @info: Implement FWCTL_INFO. Return a kmalloc() memory that is copied + * to out_device_data. On input length indicates the size of the user + * buffer on output it indicates the size of the memory. The driver can + * ignore length on input, the core code will handle everything. + */ + void *(*info)(struct fwctl_uctx *uctx, size_t *length); }; /** -- cgit v1.2.3 From 8eea4e74475804285507c077bec87d40be87ff06 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 27 Feb 2025 20:26:32 -0400 Subject: taint: Add TAINT_FWCTL Requesting a fwctl scope of access that includes mutating device debug data will cause the kernel to be tainted. 
Changing the device operation through things in the debug scope may cause
the device to malfunction in undefined ways. This should be reflected in
the TAINT flags to help any debuggers understand that something has been
done.

Link: https://patch.msgid.link/r/4-v5-642aa0c94070+4447f-fwctl_jgg@nvidia.com
Reviewed-by: Jonathan Cameron
Reviewed-by: Dave Jiang
Reviewed-by: Shannon Nelson
Tested-by: Dave Jiang
Tested-by: Shannon Nelson
Signed-off-by: Jason Gunthorpe
---
 include/linux/panic.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/panic.h b/include/linux/panic.h
index 54d90b6c5f47..2494d51707ef 100644
--- a/include/linux/panic.h
+++ b/include/linux/panic.h
@@ -74,7 +74,8 @@ static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout)
 #define TAINT_AUX			16
 #define TAINT_RANDSTRUCT		17
 #define TAINT_TEST			18
-#define TAINT_FLAGS_COUNT		19
+#define TAINT_FWCTL			19
+#define TAINT_FLAGS_COUNT		20
 #define TAINT_FLAGS_MAX			((1UL << TAINT_FLAGS_COUNT) - 1)

 struct taint_flag {
-- cgit v1.2.3

From 840cfb7cf570b681f5d20e19f7c2675a9d991732 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Thu, 27 Feb 2025 20:26:33 -0400
Subject: fwctl: FWCTL_RPC to execute a Remote Procedure Call to device firmware

Add the FWCTL_RPC ioctl which allows a request/response RPC call to device
firmware. Drivers implementing this call must follow the security
guidelines under Documentation/userspace-api/fwctl.rst

The core code provides some memory management helpers to get the messages
copied from and back to userspace. The driver is responsible for
allocating the output message memory and delivering the message to the
device.

Link: https://patch.msgid.link/r/5-v5-642aa0c94070+4447f-fwctl_jgg@nvidia.com
Reviewed-by: Jonathan Cameron
Reviewed-by: Dave Jiang
Reviewed-by: Shannon Nelson
Tested-by: Dave Jiang
Tested-by: Shannon Nelson
Signed-off-by: Jason Gunthorpe
---
 include/linux/fwctl.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fwctl.h b/include/linux/fwctl.h
index 700a5be940e3..5d61fc8a6871 100644
--- a/include/linux/fwctl.h
+++ b/include/linux/fwctl.h
@@ -47,6 +47,14 @@ struct fwctl_ops {
 	 * ignore length on input, the core code will handle everything.
 	 */
 	void *(*info)(struct fwctl_uctx *uctx, size_t *length);
+	/**
+	 * @fw_rpc: Implement FWCTL_RPC. Deliver rpc_in/in_len to the FW and
+	 * return the response and set out_len. rpc_in can be returned as the
+	 * response pointer. Otherwise the returned pointer is freed with
+	 * kvfree().
+	 */
+	void *(*fw_rpc)(struct fwctl_uctx *uctx, enum fwctl_rpc_scope scope,
+			void *rpc_in, size_t in_len, size_t *out_len);
 };

 /**
-- cgit v1.2.3

From a14fa8ec9d811c1cef902345b58294b589655b41 Mon Sep 17 00:00:00 2001
From: Matthew Brost
Date: Wed, 5 Mar 2025 17:26:27 -0800
Subject: mm/migrate: Add migrate_device_pfns

Add migrate_device_pfns(), which prepares an array of pre-populated
device pages for migration. This is needed for the eviction of a known
set of non-contiguous device pages to CPU pages, which is a common case
for SVM in DRM drivers using TTM.
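An illustrative eviction flow using the new helper (sketch only; the
companion calls are the existing migrate_device_* API declared below,
and populating src_pfns/dst_pfns is driver-specific):

	static int mydrv_evict(unsigned long *src_pfns, unsigned long *dst_pfns,
			       unsigned long npages)
	{
		int ret;

		/* lock and prepare the pre-populated device pages */
		ret = migrate_device_pfns(src_pfns, npages);
		if (ret)
			return ret;

		/* ... allocate system-memory destination pages into dst_pfns
		 * and copy the data over ...
		 */

		migrate_device_pages(src_pfns, dst_pfns, npages);
		migrate_device_finalize(src_pfns, dst_pfns, npages);
		return 0;
	}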
v2: - s/migrate_device_vma_range/migrate_device_prepopulated_range - Drop extra mmu invalidation (Vetter) v3: - s/migrate_device_prepopulated_range/migrate_device_pfns (Alistar) - Use helper to lock device pages (Alistar) - Update commit message with why this is required (Alistar) Cc: Andrew Morton Signed-off-by: Matthew Brost Reviewed-by: Alistair Popple Reviewed-by: Gwan-gyeong Mun Link: https://patchwork.freedesktop.org/patch/msgid/20250306012657.3505757-3-matthew.brost@intel.com --- include/linux/migrate.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 29919faea2f1..80891120cca9 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -227,6 +227,7 @@ void migrate_vma_pages(struct migrate_vma *migrate); void migrate_vma_finalize(struct migrate_vma *migrate); int migrate_device_range(unsigned long *src_pfns, unsigned long start, unsigned long npages); +int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages); void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns, unsigned long npages); void migrate_device_finalize(unsigned long *src_pfns, -- cgit v1.2.3 From 860a731f52f83309c213b943bac8f4ea70a88805 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 5 Mar 2025 22:08:21 +0100 Subject: PM: EM: Consify two parameters of em_dev_register_perf_domain() Notice that em_dev_register_perf_domain() and the functions called by it do not update objects pointed to by its cb and cpus parameters, so the const modifier can be added to them. This allows the return value of cpumask_of() or a pointer to a struct em_data_callback declared as const to be passed to em_dev_register_perf_domain() directly without explicit type casting which is rather handy. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/4648962.LvFx2qVVIh@rjwysocki.net --- include/linux/energy_model.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 78318d49276d..b23c8c798dac 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -169,8 +169,8 @@ struct em_perf_domain *em_pd_get(struct device *dev); int em_dev_update_perf_domain(struct device *dev, struct em_perf_table __rcu *new_table); int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *span, - bool microwatts); + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts); void em_dev_unregister_perf_domain(struct device *dev); struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd); void em_table_free(struct em_perf_table __rcu *table); @@ -346,8 +346,8 @@ struct em_data_callback {}; static inline int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *span, - bool microwatts) + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) { return -EINVAL; } -- cgit v1.2.3 From d4c22ec680c8db832ffc0b964c6008e65436cba8 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 5 Mar 2025 08:37:19 -0800 Subject: net: hold netdev instance lock during ndo_open/ndo_stop For the drivers that use shaper API, switch to the mode where core stack holds the netdev lock. 
This affects two drivers: * iavf - already grabs netdev lock in ndo_open/ndo_stop, so mostly remove these * netdevsim - switch to _locked APIs to avoid deadlock iavf_close diff is a bit confusing, the existing call looks like this: iavf_close() { netdev_lock() .. netdev_unlock() wait_event_timeout(down_waitqueue) } I change it to the following: netdev_lock() iavf_close() { .. netdev_unlock() wait_event_timeout(down_waitqueue) netdev_lock() // reusing this lock call } netdev_unlock() Since I'm reusing existing netdev_lock call, so it looks like I only add netdev_unlock. Cc: Saeed Mahameed Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250305163732.2766420-2-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7ab86ec228b7..33066b155c84 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2753,6 +2753,29 @@ static inline void netdev_assert_locked_or_invisible(struct net_device *dev) netdev_assert_locked(dev); } +static inline bool netdev_need_ops_lock(struct net_device *dev) +{ + bool ret = false; + +#if IS_ENABLED(CONFIG_NET_SHAPER) + ret |= !!dev->netdev_ops->net_shaper_ops; +#endif + + return ret; +} + +static inline void netdev_lock_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_lock(dev); +} + +static inline void netdev_unlock_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_unlock(dev); +} + void netif_napi_set_irq_locked(struct napi_struct *napi, int irq); static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) -- cgit v1.2.3 From c4f0f30b424e7258a777bcbcbf9006207da4854c Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 5 Mar 2025 08:37:20 -0800 Subject: net: hold netdev instance lock during nft ndo_setup_tc Introduce new dev_setup_tc for nft ndo_setup_tc paths. Reviewed-by: Eric Dumazet Cc: Saeed Mahameed Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250305163732.2766420-3-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 33066b155c84..69951eeb96d2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3353,6 +3353,8 @@ int dev_alloc_name(struct net_device *dev, const char *name); int dev_open(struct net_device *dev, struct netlink_ext_ack *extack); void dev_close(struct net_device *dev); void dev_close_many(struct list_head *head, bool unlink); +int dev_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, -- cgit v1.2.3 From cae03e5bdd9e0c8570506c50f1f234da40201732 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 5 Mar 2025 08:37:23 -0800 Subject: net: hold netdev instance lock during queue operations For the drivers that use queue management API, switch to the mode where core stack holds the netdev instance lock. This affects the following drivers: - bnxt - gve - netdevsim Originally I locked only start/stop, but switched to holding the lock over all iterations to make them look atomic to the device (feels like it should be easier to reason about). 
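Conceptually, the result is that the whole restart sequence is bracketed
by the instance lock (simplified sketch, not the literal patch):

	netdev_lock_ops(dev);	/* no-op unless the device needs the ops lock */
	/* ... iterate over the queues and stop/start each one via
	 * dev->queue_mgmt_ops ...
	 */
	netdev_unlock_ops(dev);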
Reviewed-by: Eric Dumazet Cc: Saeed Mahameed Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250305163732.2766420-6-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 69951eeb96d2..abda17b15950 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2755,7 +2755,7 @@ static inline void netdev_assert_locked_or_invisible(struct net_device *dev) static inline bool netdev_need_ops_lock(struct net_device *dev) { - bool ret = false; + bool ret = !!dev->queue_mgmt_ops; #if IS_ENABLED(CONFIG_NET_SHAPER) ret |= !!dev->netdev_ops->net_shaper_ops; -- cgit v1.2.3 From 7e4d784f5810bba76c4593791028e13cce4af547 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 5 Mar 2025 08:37:24 -0800 Subject: net: hold netdev instance lock during rtnetlink operations To preserve the atomicity, hold the lock while applying multiple attributes. The major issue with a full conversion to the instance lock are software nesting devices (bonding/team/vrf/etc). Those devices call into the core stack for their lower (potentially real hw) devices. To avoid explicitly wrapping all those places into instance lock/unlock, introduce new API boundaries: - (some) existing dev_xxx calls are now considered "external" (to drivers) APIs and they transparently grab the instance lock if needed (dev_api.c) - new netif_xxx calls are internal core stack API (naming is sketchy, I've tried netdev_xxx_locked per Jakub's suggestion, but it feels a bit verbose; but happy to get back to this naming scheme if this is the preference) This avoids touching most of the existing ioctl/sysfs/drivers paths. Note the special handling of ndo_xxx_slave operations: I exploit the fact that none of the drivers that call these functions need/use instance lock. At the same time, they use dev_xxx APIs, so the lower device has to be unlocked. Changes in unregister_netdevice_many_notify (to protect dev->state with instance lock) trigger lockdep - the loop over close_list (mostly from cleanup_net) introduces spurious ordering issues. netdev_lock_cmp_fn has a justification on why it's ok to suppress for now. Cc: Saeed Mahameed Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250305163732.2766420-7-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index abda17b15950..be3d09b61e95 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2620,16 +2620,35 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, f(dev, &dev->_tx[i], arg); } +static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, + const struct lockdep_map *b) +{ + /* Only lower devices currently grab the instance lock, so no + * real ordering issues can occur. In the near future, only + * hardware devices will grab instance lock which also does not + * involve any ordering. Suppress lockdep ordering warnings + * until (if) we start grabbing instance lock on pure SW + * devices (bond/team/veth/etc). 
+	 */
+	if (a == b)
+		return 0;
+	return -1;
+}
+
 #define netdev_lockdep_set_classes(dev)				\
 {								\
 	static struct lock_class_key qdisc_tx_busylock_key;	\
 	static struct lock_class_key qdisc_xmit_lock_key;	\
 	static struct lock_class_key dev_addr_list_lock_key;	\
+	static struct lock_class_key dev_instance_lock_key;	\
 	unsigned int i;						\
 								\
 	(dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key;	\
 	lockdep_set_class(&(dev)->addr_list_lock,		\
 			  &dev_addr_list_lock_key);		\
+	lockdep_set_class(&(dev)->lock,				\
+			  &dev_instance_lock_key);		\
+	lock_set_cmp_fn(&dev->lock, netdev_lock_cmp_fn, NULL);	\
 	for (i = 0; i < (dev)->num_tx_queues; i++)		\
 		lockdep_set_class(&(dev)->_tx[i]._xmit_lock,	\
 				  &qdisc_xmit_lock_key);	\
@@ -2776,6 +2795,12 @@ static inline void netdev_unlock_ops(struct net_device *dev)
 		netdev_unlock(dev);
 }
 
+static inline void netdev_ops_assert_locked(struct net_device *dev)
+{
+	if (netdev_need_ops_lock(dev))
+		lockdep_assert_held(&dev->lock);
+}
+
 void netif_napi_set_irq_locked(struct napi_struct *napi, int irq);
 
 static inline void netif_napi_set_irq(struct napi_struct *napi, int irq)
@@ -3350,7 +3375,9 @@ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name);
 struct net_device *__dev_get_by_name(struct net *net, const char *name);
 bool netdev_name_in_use(struct net *net, const char *name);
 int dev_alloc_name(struct net_device *dev, const char *name);
+int netif_open(struct net_device *dev, struct netlink_ext_ack *extack);
 int dev_open(struct net_device *dev, struct netlink_ext_ack *extack);
+void netif_close(struct net_device *dev);
 void dev_close(struct net_device *dev);
 void dev_close_many(struct list_head *head, bool unlink);
 int dev_setup_tc(struct net_device *dev, enum tc_setup_type type,
		 void *type_data);
@@ -4211,25 +4238,26 @@ int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata);
 unsigned int dev_get_flags(const struct net_device *);
 int __dev_change_flags(struct net_device *dev, unsigned int flags,
		       struct netlink_ext_ack *extack);
+int netif_change_flags(struct net_device *dev, unsigned int flags,
+		       struct netlink_ext_ack *extack);
 int dev_change_flags(struct net_device *dev, unsigned int flags,
		     struct netlink_ext_ack *extack);
+int netif_set_alias(struct net_device *dev, const char *alias, size_t len);
 int dev_set_alias(struct net_device *, const char *, size_t);
 int dev_get_alias(const struct net_device *, char *, size_t);
-int __dev_change_net_namespace(struct net_device *dev, struct net *net,
+int netif_change_net_namespace(struct net_device *dev, struct net *net,
			       const char *pat, int new_ifindex,
			       struct netlink_ext_ack *extack);
-static inline int dev_change_net_namespace(struct net_device *dev, struct net *net,
-					   const char *pat)
-{
-	return __dev_change_net_namespace(dev, net, pat, 0, NULL);
-}
+int dev_change_net_namespace(struct net_device *dev, struct net *net,
+			     const char *pat);
 int __dev_set_mtu(struct net_device *, int);
 int dev_set_mtu(struct net_device *, int);
 int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
			      struct netlink_ext_ack *extack);
 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
			struct netlink_ext_ack *extack);
+int netif_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
+			       struct netlink_ext_ack *extack);
 int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
			     struct netlink_ext_ack *extack);
 int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
-- 
cgit v1.2.3


From ffb7ed19ac0a9fa9ea79af1d7b42c03a10da98a5 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev
Date: Wed, 5 Mar 2025 08:37:25 -0800
Subject: net: hold
netdev instance lock during ioctl operations

Convert all ndo_eth_ioctl invocations to dev_eth_ioctl(), which does
the locking. Reflow some of the dev_siocxxx helpers to drop the else
clause.

Cc: Saeed Mahameed
Signed-off-by: Stanislav Fomichev
Link: https://patch.msgid.link/20250305163732.2766420-8-sdf@fomichev.me
Signed-off-by: Jakub Kicinski
---
 include/linux/netdevice.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index be3d09b61e95..8d243c0ec39d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4229,6 +4229,8 @@ int put_user_ifreq(struct ifreq *ifr, void __user *arg);
 int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
	      void __user *data, bool *need_copyout);
 int dev_ifconf(struct net *net, struct ifconf __user *ifc);
+int dev_eth_ioctl(struct net_device *dev,
+		  struct ifreq *ifr, unsigned int cmd);
 int generic_hwtstamp_get_lower(struct net_device *dev,
			       struct kernel_hwtstamp_config *kernel_cfg);
 int generic_hwtstamp_set_lower(struct net_device *dev,
@@ -4251,6 +4253,7 @@ int netif_change_net_namespace(struct net_device *dev, struct net *net,
 int dev_change_net_namespace(struct net_device *dev, struct net *net,
			     const char *pat);
 int __dev_set_mtu(struct net_device *, int);
+int netif_set_mtu(struct net_device *dev, int new_mtu);
 int dev_set_mtu(struct net_device *, int);
 int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
			      struct netlink_ext_ack *extack);
-- 
cgit v1.2.3


From ad7c7b2172c388818a111455643491d75f535e90 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev
Date: Wed, 5 Mar 2025 08:37:26 -0800
Subject: net: hold netdev instance lock during sysfs operations

Most of the sysfs operations are already covered by the converted
dev_xxx APIs. Add the locking wrappers for the remaining ones.
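For illustration, the external-API wrapper pattern looks roughly like
this (a sketch of the idea, not the exact dev_api.c code):

void dev_disable_lro(struct net_device *dev)
{
	netdev_lock_ops(dev);	/* grab the instance lock if needed */
	netif_disable_lro(dev);	/* internal, lock-assuming variant */
	netdev_unlock_ops(dev);
}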
Cc: Saeed Mahameed Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250305163732.2766420-9-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8d243c0ec39d..c61b12809588 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3382,6 +3382,7 @@ void dev_close(struct net_device *dev); void dev_close_many(struct list_head *head, bool unlink); int dev_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data); +void netif_disable_lro(struct net_device *dev); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, @@ -4257,6 +4258,8 @@ int netif_set_mtu(struct net_device *dev, int new_mtu); int dev_set_mtu(struct net_device *, int); int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack); +int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa, + struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack); int netif_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, @@ -5016,6 +5019,7 @@ static inline void __dev_mc_unsync(struct net_device *dev, /* Functions used for secondary unicast and multicast support */ void dev_set_rx_mode(struct net_device *dev); int dev_set_promiscuity(struct net_device *dev, int inc); +int netif_set_allmulti(struct net_device *dev, int inc, bool notify); int dev_set_allmulti(struct net_device *dev, int inc); void netdev_state_change(struct net_device *dev); void __netdev_notify_peers(struct net_device *dev); -- cgit v1.2.3 From 97246d6d21c21fb4c5235770a21855e457096a96 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 5 Mar 2025 08:37:27 -0800 Subject: net: hold netdev instance lock during ndo_bpf Cover the paths that come via bpf system call and XSK bind. Cc: Saeed Mahameed Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250305163732.2766420-10-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c61b12809588..ca9c09dab14e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4277,6 +4277,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u8 dev_xdp_prog_count(struct net_device *dev); +int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf); int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf); u8 dev_xdp_sb_prog_count(struct net_device *dev); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); -- cgit v1.2.3 From df43d8bf10316a7c3b1e47e3cc0057a54df4a5b8 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 5 Mar 2025 08:37:29 -0800 Subject: net: replace dev_addr_sem with netdev instance lock Lockdep reports possible circular dependency in [0]. Instead of fixing the ordering, replace global dev_addr_sem with netdev instance lock. Most of the paths that set/get mac are RTNL protected. 
Two places where it's not, convert to explicit locking: - sysfs address_show - dev_get_mac_address via dev_ioctl 0: https://netdev-3.bots.linux.dev/vmksft-forwarding-dbg/results/993321/24-router-bridge-1d-lag-sh/stderr Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250305163732.2766420-12-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ca9c09dab14e..f3e6e6f27e22 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2492,7 +2492,7 @@ struct net_device { * * Protects: * @gro_flush_timeout, @napi_defer_hard_irqs, @napi_list, - * @net_shaper_hierarchy, @reg_state, @threaded + * @net_shaper_hierarchy, @reg_state, @threaded, @dev_addr * * Partially protects (writers must hold both @lock and rtnl_lock): * @up @@ -4262,10 +4262,6 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack); -int netif_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, - struct netlink_ext_ack *extack); -int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, - struct netlink_ext_ack *extack); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name); int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse); -- cgit v1.2.3 From 605ef7aec0605bd5506b31591de278d652bd0096 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 5 Mar 2025 08:37:30 -0800 Subject: net: add option to request netdev instance lock Currently only the drivers that implement shaper or queue APIs are grabbing instance lock. Add an explicit opt-in for the drivers that want to grab the lock without implementing the above APIs. There is a 3-byte hole after @up, use it: /* --- cacheline 47 boundary (3008 bytes) --- */ u32 napi_defer_hard_irqs; /* 3008 4 */ bool up; /* 3012 1 */ /* XXX 3 bytes hole, try to pack */ struct mutex lock; /* 3016 144 */ /* XXX last struct has 1 hole */ Cc: Saeed Mahameed Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250305163732.2766420-13-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f3e6e6f27e22..adf201617b72 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2485,6 +2485,12 @@ struct net_device { */ bool up; + /** + * @request_ops_lock: request the core to run all @netdev_ops and + * @ethtool_ops under the @lock. + */ + bool request_ops_lock; + /** * @lock: netdev-scope lock, protects a small selection of fields. * Should always be taken using netdev_lock() / netdev_unlock() helpers. 
@@ -2774,7 +2780,7 @@ static inline void netdev_assert_locked_or_invisible(struct net_device *dev) static inline bool netdev_need_ops_lock(struct net_device *dev) { - bool ret = !!dev->queue_mgmt_ops; + bool ret = dev->request_ops_lock || !!dev->queue_mgmt_ops; #if IS_ENABLED(CONFIG_NET_SHAPER) ret |= !!dev->netdev_ops->net_shaper_ops; -- cgit v1.2.3 From cc34acd577f1a6ed805106bfcc9a262837dbd0da Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 5 Mar 2025 08:37:31 -0800 Subject: docs: net: document new locking reality Also clarify ndo_get_stats (that read and write paths can run concurrently) and mention only RCU. Cc: Saeed Mahameed Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250305163732.2766420-14-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index adf201617b72..2f8560a354ba 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2505,6 +2505,10 @@ struct net_device { * * Also protects some fields in struct napi_struct. * + * For the drivers that implement shaper or queue API, the scope + * of this lock is expanded to cover most ndo/queue/ethtool/sysfs + * operations. + * * Ordering: take after rtnl_lock. */ struct mutex lock; -- cgit v1.2.3 From 004b5008016a2cc37103bf8d9968573771cd311f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 5 Mar 2025 08:37:32 -0800 Subject: eth: bnxt: remove most dependencies on RTNL Only devlink and sriov paths are grabbing rtnl explicitly. The rest is covered by netdev instance lock which the core now grabs, so there is no need to manage rtnl in most places anymore. On the core side we can now try to drop rtnl in some places (do_setlink for example) for the drivers that signal non-rtnl mode (TBD). Boot-tested and with `ethtool -L eth1 combined 24` to trigger reset. Cc: Saeed Mahameed Reviewed-by: Michael Chan Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250305163732.2766420-15-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2f8560a354ba..d206c9592b60 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2765,6 +2765,11 @@ static inline void netdev_lock(struct net_device *dev) mutex_lock(&dev->lock); } +static inline bool netdev_trylock(struct net_device *dev) +{ + return mutex_trylock(&dev->lock); +} + static inline void netdev_unlock(struct net_device *dev) { mutex_unlock(&dev->lock); -- cgit v1.2.3 From e7112524e5e885181cc5ae4d258f33b9dbe0b907 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 6 Mar 2025 08:27:51 -0800 Subject: block: Name the RQF flags enum Commit 5f89154e8e9e3445f9b59 ("block: Use enum to define RQF_x bit indexes") converted the RQF flags to an anonymous enum, which was a beneficial change. This patch goes one step further by naming the enum as "rqf_flags". This naming enables exporting these flags to BPF clients, eliminating the need to duplicate these flags in BPF code. 
Instead, BPF clients can now access the same kernel-side values through CO:RE (Compile Once, Run Everywhere), as shown in this example: rqf_stats = bpf_core_enum_value(enum rqf_flags, __RQF_STATS) Suggested-by: Yonghong Song Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20250306-rqf_flags-v1-1-bbd64918b406@debian.org Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index fa2a76cc2f73..71f4f0cc3dac 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -28,7 +28,7 @@ typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t); typedef __u32 __bitwise req_flags_t; /* Keep rqf_name[] in sync with the definitions below */ -enum { +enum rqf_flags { /* drive already may have started this one */ __RQF_STARTED, /* request for flush sequence */ -- cgit v1.2.3 From a2f61f1db85532e72fb8a3af51b06df94bb82912 Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Tue, 4 Mar 2025 18:06:15 +0200 Subject: net/mlx5: Relocate function declarations from port.h to mlx5_core.h The port header is a general file under include, yet it contains declarations for functions that are either not exported or exported but not used outside the mlx5_core driver. To enhance code organization, we move these declarations to mlx5_core.h, where they are more appropriately scoped. This refactor removes unnecessary exported symbols and prevents unexported functions from being inadvertently referenced outside of the mlx5_core driver. Signed-off-by: Shahar Shitrit Reviewed-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250304160620.417580-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/port.h | 85 +---------------------------------------------- 1 file changed, 1 insertion(+), 84 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index fd625e0dd869..58770b86f793 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -61,15 +61,6 @@ enum mlx5_an_status { #define MLX5_EEPROM_PAGE_LENGTH 256 #define MLX5_EEPROM_HIGH_PAGE_LENGTH 128 -struct mlx5_module_eeprom_query_params { - u16 size; - u16 offset; - u16 i2c_address; - u32 page; - u32 bank; - u32 module_number; -}; - enum mlx5e_link_mode { MLX5E_1000BASE_CX_SGMII = 0, MLX5E_1000BASE_KX = 1, @@ -145,12 +136,6 @@ enum mlx5_ptys_width { MLX5_PTYS_WIDTH_12X = 1 << 4, }; -struct mlx5_port_eth_proto { - u32 cap; - u32 admin; - u32 oper; -}; - #define MLX5E_PROT_MASK(link_mode) (1U << link_mode) #define MLX5_GET_ETH_PROTO(reg, out, ext, field) \ (ext ? 
MLX5_GET(reg, out, ext_##field) : \ @@ -163,14 +148,7 @@ int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys, int mlx5_query_ib_port_oper(struct mlx5_core_dev *dev, u16 *link_width_oper, u16 *proto_oper, u8 local_port, u8 plane_index); -void mlx5_toggle_port_link(struct mlx5_core_dev *dev); -int mlx5_set_port_admin_status(struct mlx5_core_dev *dev, - enum mlx5_port_status status); -int mlx5_query_port_admin_status(struct mlx5_core_dev *dev, - enum mlx5_port_status *status); -int mlx5_set_port_beacon(struct mlx5_core_dev *dev, u16 beacon_duration); - -int mlx5_set_port_mtu(struct mlx5_core_dev *dev, u16 mtu, u8 port); + void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, u16 *max_mtu, u8 port); void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, u16 *oper_mtu, u8 port); @@ -178,65 +156,4 @@ void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, u16 *oper_mtu, int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev, u8 *vl_hw_cap, u8 local_port); -int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause); -int mlx5_query_port_pause(struct mlx5_core_dev *dev, - u32 *rx_pause, u32 *tx_pause); - -int mlx5_set_port_pfc(struct mlx5_core_dev *dev, u8 pfc_en_tx, u8 pfc_en_rx); -int mlx5_query_port_pfc(struct mlx5_core_dev *dev, u8 *pfc_en_tx, - u8 *pfc_en_rx); - -int mlx5_set_port_stall_watermark(struct mlx5_core_dev *dev, - u16 stall_critical_watermark, - u16 stall_minor_watermark); -int mlx5_query_port_stall_watermark(struct mlx5_core_dev *dev, - u16 *stall_critical_watermark, u16 *stall_minor_watermark); - -int mlx5_max_tc(struct mlx5_core_dev *mdev); - -int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc); -int mlx5_query_port_prio_tc(struct mlx5_core_dev *mdev, - u8 prio, u8 *tc); -int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group); -int mlx5_query_port_tc_group(struct mlx5_core_dev *mdev, - u8 tc, u8 *tc_group); -int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw); -int mlx5_query_port_tc_bw_alloc(struct mlx5_core_dev *mdev, - u8 tc, u8 *bw_pct); -int mlx5_modify_port_ets_rate_limit(struct mlx5_core_dev *mdev, - u8 *max_bw_value, - u8 *max_bw_unit); -int mlx5_query_port_ets_rate_limit(struct mlx5_core_dev *mdev, - u8 *max_bw_value, - u8 *max_bw_unit); -int mlx5_set_port_wol(struct mlx5_core_dev *mdev, u8 wol_mode); -int mlx5_query_port_wol(struct mlx5_core_dev *mdev, u8 *wol_mode); - -int mlx5_query_ports_check(struct mlx5_core_dev *mdev, u32 *out, int outlen); -int mlx5_set_ports_check(struct mlx5_core_dev *mdev, u32 *in, int inlen); -int mlx5_set_port_fcs(struct mlx5_core_dev *mdev, u8 enable); -void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported, - bool *enabled); -int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, - u16 offset, u16 size, u8 *data); -int mlx5_query_module_eeprom_by_page(struct mlx5_core_dev *dev, - struct mlx5_module_eeprom_query_params *params, u8 *data); - -int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out); -int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in); - -int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state); -int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state); -int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio); -int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio); - -int mlx5_port_query_eth_proto(struct mlx5_core_dev *dev, u8 port, bool ext, - struct mlx5_port_eth_proto *eproto); -bool mlx5_ptys_ext_supported(struct mlx5_core_dev *mdev); -u32 
mlx5_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper, - bool force_legacy); -u32 mlx5_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed, - bool force_legacy); -int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed); - #endif /* __MLX5_PORT_H__ */ -- cgit v1.2.3 From 00a7d39898c8010bfd5ff62af31ca5db34421b38 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 6 Mar 2025 18:25:35 -1000 Subject: fs/pipe: add simpler helpers for common cases The fix to atomically read the pipe head and tail state when not holding the pipe mutex has caused a number of headaches due to the size change of the involved types. It turns out that we don't have _that_ many places that access these fields directly and were affected, but we have more than we strictly should have, because our low-level helper functions have been designed to have intimate knowledge of how the pipes work. And as a result, that random noise of direct 'pipe->head' and 'pipe->tail' accesses makes it harder to pinpoint any actual potential problem spots remaining. For example, we didn't have a "is the pipe full" helper function, but instead had a "given these pipe buffer indexes and this pipe size, is the pipe full". That's because some low-level pipe code does actually want that much more complicated interface. But most other places literally just want a "is the pipe full" helper, and not having it meant that those places ended up being unnecessarily much too aware of this all. It would have been much better if only the very core pipe code that cared had been the one aware of this all. So let's fix it - better late than never. This just introduces the trivial wrappers for "is this pipe full or empty" and to get how many pipe buffers are used, so that instead of writing if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) the places that literally just want to know if a pipe is full can just say if (pipe_is_full(pipe)) instead. The existing trivial cases were converted with a 'sed' script. This cuts down on the places that access pipe->head and pipe->tail directly outside of the pipe code (and core splice code) quite a lot. The splice code in particular still revels in doing the direct low-level accesses, and the fuse fuse_dev_splice_write() code also seems a bit unnecessarily eager to go very low-level, but it's at least a bit better than it used to be. 
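To illustrate the new helpers together (example_read_ready() is a
hypothetical function used only for this sketch):

static unsigned int example_read_ready(const struct pipe_inode_info *pipe)
{
	if (pipe_is_empty(pipe))
		return 0;			/* nothing to read */
	return pipe_buf_usage(pipe);		/* occupied slots in the ring */
}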
Signed-off-by: Linus Torvalds
---
 include/linux/pipe_fs_i.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 4d0a2267e6ef..b698758000f8 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -208,6 +208,33 @@ static inline bool pipe_full(unsigned int head, unsigned int tail,
 	return pipe_occupancy(head, tail) >= limit;
 }
 
+/**
+ * pipe_is_full - Return true if the pipe is full
+ * @pipe: the pipe
+ */
+static inline bool pipe_is_full(const struct pipe_inode_info *pipe)
+{
+	return pipe_full(pipe->head, pipe->tail, pipe->max_usage);
+}
+
+/**
+ * pipe_is_empty - Return true if the pipe is empty
+ * @pipe: the pipe
+ */
+static inline bool pipe_is_empty(const struct pipe_inode_info *pipe)
+{
+	return pipe_empty(pipe->head, pipe->tail);
+}
+
+/**
+ * pipe_buf_usage - Return how many pipe buffers are in use
+ * @pipe: the pipe
+ */
+static inline unsigned int pipe_buf_usage(const struct pipe_inode_info *pipe)
+{
+	return pipe_occupancy(pipe->head, pipe->tail);
+}
+
 /**
  * pipe_buf - Return the pipe buffer for the specified slot in the pipe ring
  * @pipe: The pipe to access
-- 
cgit v1.2.3


From a64e5a596067bddba87fcc2ce37e56c3fca831b7 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain
Date: Thu, 6 Mar 2025 18:04:03 -0800
Subject: bdev: add back PAGE_SIZE block size validation for sb_set_blocksize()

The commit titled "block/bdev: lift block size restrictions to 64k"
lifted the block layer's max supported block size to 64k inside the
helper blk_validate_block_size() now that we support large folios.
However, in lifting the block size limit we also broke the silly but
widespread way many filesystems use sb_set_blocksize() to *verify* that
the block size is <= PAGE_SIZE. The call to sb_set_blocksize() was used
to check that the block size <= PAGE_SIZE since historically we've
always allowed userspace to create, for example, 64k block size
filesystems even on 4k page size systems, but what we didn't allow was
mounting them. Older filesystems have been using the check with
sb_set_blocksize() for years.

While we could argue that such checks should be filesystem specific,
there are many more users of sb_set_blocksize() than LBS-enabled
filesystems upstream, so just do the easier thing and bring back the
PAGE_SIZE check for sb_set_blocksize() users and only skip it for LBS
enabled filesystems.

This will ensure that tests such as generic/466, when run in a loop
against say, ext4, won't actually try to mount a filesystem with a
block size larger than your filesystem supports given your PAGE_SIZE,
and in the worst case crash.

Cc: Kent Overstreet
Signed-off-by: Luis Chamberlain
Link: https://lore.kernel.org/r/20250307020403.3068567-1-mcgrof@kernel.org
Reviewed-by: Kent Overstreet
Reviewed-by: "Darrick J. Wong"
Signed-off-by: Christian Brauner
---
 include/linux/fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index be3ad155ec9f..f47fbe16b101 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2604,6 +2604,7 @@ struct file_system_type {
 #define FS_DISALLOW_NOTIFY_PERM	16 /* Disable fanotify permission events */
 #define FS_ALLOW_IDMAP		32 /* FS has been updated to handle vfs idmappings. */
 #define FS_MGTIME		64 /* FS uses multigrain timestamps */
+#define FS_LBS			128 /* FS supports LBS */
 #define FS_RENAME_DOES_D_MOVE	32768 /* FS will handle d_move() during rename() internally.
*/ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; -- cgit v1.2.3 From 3ee7be9e10dd5f79448788b899591d4bd2bf0c19 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Mar 2025 17:49:20 +0100 Subject: PM: EM: Address RCU-related sparse warnings The usage of __rcu in the Energy Model code is quite inconsistent which causes the following sparse warnings to trigger: kernel/power/energy_model.c:169:15: warning: incorrect type in assignment (different address spaces) kernel/power/energy_model.c:169:15: expected struct em_perf_table [noderef] __rcu *table kernel/power/energy_model.c:169:15: got struct em_perf_table * kernel/power/energy_model.c:171:9: warning: incorrect type in argument 1 (different address spaces) kernel/power/energy_model.c:171:9: expected struct callback_head *head kernel/power/energy_model.c:171:9: got struct callback_head [noderef] __rcu * kernel/power/energy_model.c:171:9: warning: cast removes address space '__rcu' of expression kernel/power/energy_model.c:182:19: warning: incorrect type in argument 1 (different address spaces) kernel/power/energy_model.c:182:19: expected struct kref *kref kernel/power/energy_model.c:182:19: got struct kref [noderef] __rcu * kernel/power/energy_model.c:200:15: warning: incorrect type in assignment (different address spaces) kernel/power/energy_model.c:200:15: expected struct em_perf_table [noderef] __rcu *table kernel/power/energy_model.c:200:15: got void *[assigned] _res kernel/power/energy_model.c:204:20: warning: incorrect type in argument 1 (different address spaces) kernel/power/energy_model.c:204:20: expected struct kref *kref kernel/power/energy_model.c:204:20: got struct kref [noderef] __rcu * kernel/power/energy_model.c:320:19: warning: incorrect type in argument 1 (different address spaces) kernel/power/energy_model.c:320:19: expected struct kref *kref kernel/power/energy_model.c:320:19: got struct kref [noderef] __rcu * kernel/power/energy_model.c:325:45: warning: incorrect type in argument 2 (different address spaces) kernel/power/energy_model.c:325:45: expected struct em_perf_state *table kernel/power/energy_model.c:325:45: got struct em_perf_state [noderef] __rcu * kernel/power/energy_model.c:425:45: warning: incorrect type in argument 3 (different address spaces) kernel/power/energy_model.c:425:45: expected struct em_perf_state *table kernel/power/energy_model.c:425:45: got struct em_perf_state [noderef] __rcu * kernel/power/energy_model.c:442:15: warning: incorrect type in argument 1 (different address spaces) kernel/power/energy_model.c:442:15: expected void const *objp kernel/power/energy_model.c:442:15: got struct em_perf_table [noderef] __rcu *[assigned] em_table kernel/power/energy_model.c:626:55: warning: incorrect type in argument 2 (different address spaces) kernel/power/energy_model.c:626:55: expected struct em_perf_state *table kernel/power/energy_model.c:626:55: got struct em_perf_state [noderef] __rcu * kernel/power/energy_model.c:681:16: warning: incorrect type in assignment (different address spaces) kernel/power/energy_model.c:681:16: expected struct em_perf_state *new_ps kernel/power/energy_model.c:681:16: got struct em_perf_state [noderef] __rcu * kernel/power/energy_model.c:699:37: warning: incorrect type in argument 2 (different address spaces) kernel/power/energy_model.c:699:37: expected struct em_perf_state *table kernel/power/energy_model.c:699:37: got struct em_perf_state [noderef] __rcu * kernel/power/energy_model.c:733:38: warning: incorrect type in 
argument 3 (different address spaces)
kernel/power/energy_model.c:733:38: expected struct em_perf_state *table
kernel/power/energy_model.c:733:38: got struct em_perf_state [noderef] __rcu *
kernel/power/energy_model.c:855:53: warning: dereference of noderef expression
kernel/power/energy_model.c:864:32: warning: dereference of noderef expression

This is because the __rcu annotation for sparse is only applicable to
pointers that need rcu_dereference() or equivalent for protection, which
basically means pointers assigned with rcu_assign_pointer().

Make all of the above sparse warnings go away by cleaning up the usage
of __rcu and using rcu_dereference_protected() where applicable.

Cc: All applicable
Signed-off-by: Rafael J. Wysocki
Reviewed-by: Lukasz Luba
Link: https://patch.msgid.link/5885405.DvuYhMxLoT@rjwysocki.net
---
 include/linux/energy_model.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index b23c8c798dac..ddd09debfc7d 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -167,13 +167,13 @@ struct em_data_callback {
 struct em_perf_domain *em_cpu_get(int cpu);
 struct em_perf_domain *em_pd_get(struct device *dev);
 int em_dev_update_perf_domain(struct device *dev,
-			      struct em_perf_table __rcu *new_table);
+			      struct em_perf_table *new_table);
 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
				const struct em_data_callback *cb,
				const cpumask_t *cpus, bool microwatts);
 void em_dev_unregister_perf_domain(struct device *dev);
-struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd);
-void em_table_free(struct em_perf_table __rcu *table);
+struct em_perf_table *em_table_alloc(struct em_perf_domain *pd);
+void em_table_free(struct em_perf_table *table);
 int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
			 int nr_states);
 int em_dev_update_chip_binning(struct device *dev);
@@ -373,14 +373,14 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd)
 	return 0;
 }
 static inline
-struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd)
+struct em_perf_table *em_table_alloc(struct em_perf_domain *pd)
 {
 	return NULL;
 }
-static inline void em_table_free(struct em_perf_table __rcu *table) {}
+static inline void em_table_free(struct em_perf_table *table) {}
 static inline
 int em_dev_update_perf_domain(struct device *dev,
-			      struct em_perf_table __rcu *new_table)
+			      struct em_perf_table *new_table)
 {
 	return -EINVAL;
 }
-- 
cgit v1.2.3


From e1d499590977a492ae120d9263bd55076aabd460 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 7 Mar 2025 16:00:29 +0000
Subject: io_uring: introduce struct iou_vec

I need a convenient way to pass around and work with an iovec+size
pair, so put them into a structure and make use of it in rw.c.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/d39fadafc9e9047b0a292e5be6db3cf2f48bb1f7.1741362889.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 35fc241c4672..9101f12d21ef 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -110,6 +110,11 @@ struct io_uring_task {
 	} ____cacheline_aligned_in_smp;
 };
 
+struct iou_vec {
+	struct iovec	*iovec;
+	unsigned	nr;
+};
+
 struct io_uring {
 	u32 head;
 	u32 tail;
-- 
cgit v1.2.3


From
9ef4cbbcb4ac3786a1a4164507511b76b2a572c5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 7 Mar 2025 16:00:30 +0000 Subject: io_uring: add infra for importing vectored reg buffers Add io_import_reg_vec(), which will be responsible for importing vectored registered buffers. The function might reallocate the vector, but it'd try to do the conversion in place first, which is why it's required of the user to pad the iovec to the right border of the cache. Overlapping also depends on struct iovec being larger than bvec, which is not the case on e.g. 32 bit architectures. Don't try to complicate this case and make sure vectors never overlap, it'll be improved later. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/60bd246b1249476a6996407c1dbc38ef6febad14.1741362889.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 9101f12d21ef..cc84f6e5a64c 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -111,8 +111,11 @@ struct io_uring_task { }; struct iou_vec { - struct iovec *iovec; - unsigned nr; + union { + struct iovec *iovec; + struct bio_vec *bvec; + }; + unsigned nr; /* number of struct iovec it can hold */ }; struct io_uring { -- cgit v1.2.3 From 835c4bdf95d5c71fd5b41f77f2343b695b4494aa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 7 Mar 2025 16:00:32 +0000 Subject: io_uring/rw: defer reg buf vec import Import registered buffers for vectored reads and writes later at issue time as we now do for other fixed ops. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e8491c976e4ab83a4e3dc428e9fe7555e59583b8.1741362889.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index cc84f6e5a64c..0e87e292bfb5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -502,6 +502,7 @@ enum { REQ_F_BUFFERS_COMMIT_BIT, REQ_F_BUF_NODE_BIT, REQ_F_HAS_METADATA_BIT, + REQ_F_IMPORT_BUFFER_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -584,6 +585,8 @@ enum { REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT), /* request has read/write metadata assigned */ REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT), + /* resolve padded iovec to registered buffers */ + REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw); -- cgit v1.2.3 From 46723e2839a543859a754b0e6b6f88fa71038b09 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 7 Mar 2025 23:02:23 +0000 Subject: power: supply: core: Remove unused power_supply_set_battery_charged power_supply_set_battery_charged() has been unused since 2019's commit 0f884f8a090e ("ARM: pxa: remove raumfeld board files and defconfig") Remove it. Signed-off-by: Dr. 
David Alan Gilbert
Link: https://lore.kernel.org/r/20250307230225.128775-2-linux@treblig.org
Signed-off-by: Sebastian Reichel
---
 include/linux/power_supply.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 6ed53b292162..5afc5946eb03 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -852,7 +852,6 @@ extern int power_supply_am_i_supplied(struct power_supply *psy);
 int power_supply_get_property_from_supplier(struct power_supply *psy,
					    enum power_supply_property psp,
					    union power_supply_propval *val);
-extern int power_supply_set_battery_charged(struct power_supply *psy);
 
 static inline bool
 power_supply_supports_maintenance_charging(struct power_supply_battery_info *info)
-- 
cgit v1.2.3


From 68b6cf4020726a8be4fa4462e12b16c7fcf8487d Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert"
Date: Fri, 7 Mar 2025 23:02:25 +0000
Subject: power: supply: Remove unused set_charged method

The previous patches in this series removed the only caller and only
setter of this method. Remove it.

Signed-off-by: Dr. David Alan Gilbert
Link: https://lore.kernel.org/r/20250307230225.128775-4-linux@treblig.org
Signed-off-by: Sebastian Reichel
---
 include/linux/power_supply.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 5afc5946eb03..92da3deba342 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -274,7 +274,6 @@ struct power_supply_desc {
 	int (*property_is_writeable)(struct power_supply *psy,
				     enum power_supply_property psp);
 	void (*external_power_changed)(struct power_supply *psy);
-	void (*set_charged)(struct power_supply *psy);
 
 	/*
	 * Set if thermal zone should not be created for this power supply.
-- 
cgit v1.2.3


From 134254038739a6c6ecb7548a2f895d89a0dc9d2a Mon Sep 17 00:00:00 2001
From: Sebastian Reichel
Date: Tue, 25 Feb 2025 00:21:34 +0100
Subject: power: supply: core: get rid of of_node

This removes .of_node from 'struct power_supply', since there is
already a copy in .dev.of_node and there is no need to have two copies.

Reviewed-by: AngeloGioacchino Del Regno
Link: https://lore.kernel.org/r/20250225-psy-core-convert-to-fwnode-v1-1-d5e4369936bb@collabora.com
Signed-off-by: Sebastian Reichel
---
 include/linux/power_supply.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 92da3deba342..888824592953 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -315,7 +315,6 @@ struct power_supply {
 	char **supplied_from;
 	size_t num_supplies;
 
-	struct device_node *of_node;
 
 	/* Driver private data */
 	void *drv_data;
-- 
cgit v1.2.3


From c8be7018d47cfdf36ae6b5bedddca9fc99cd2f0b Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert"
Date: Thu, 6 Mar 2025 18:45:34 +0000
Subject: net: phylink: Remove unused phylink_init_eee

phylink_init_eee() is currently unused. It was last added in 2019 by
commit 86e58135bc4a ("net: phylink: add phylink_init_eee() helper"),
but it never actually wired up a user. It had previously been removed
in 2017 by commit 939eae25d9a5 ("phylink: remove phylink_init_eee()").

Remove it again.

Signed-off-by: Dr.
David Alan Gilbert
Reviewed-by: Russell King (Oracle)
Link: https://patch.msgid.link/20250306184534.246152-1-linux@treblig.org
Signed-off-by: Jakub Kicinski
---
 include/linux/phylink.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 08df65f6867a..c187267a15b6 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -714,7 +714,6 @@ void phylink_ethtool_get_pauseparam(struct phylink *,
 int phylink_ethtool_set_pauseparam(struct phylink *,
				   struct ethtool_pauseparam *);
 int phylink_get_eee_err(struct phylink *);
-int phylink_init_eee(struct phylink *, bool);
 int phylink_ethtool_get_eee(struct phylink *link, struct ethtool_keee *eee);
 int phylink_ethtool_set_eee(struct phylink *link, struct ethtool_keee *eee);
 int phylink_mii_ioctl(struct phylink *, struct ifreq *, int);
-- 
cgit v1.2.3


From 248f6571fd4c51531f7f8f07f186f7ae98a50afc Mon Sep 17 00:00:00 2001
From: Breno Leitao
Date: Tue, 4 Mar 2025 07:50:41 -0800
Subject: netpoll: Optimize skb refilling on critical path

netpoll tries to refill the skb queue on every packet send, regardless
of whether packets are being consumed from the pool or not. This was
particularly problematic while being called from printk(), where the
operation would be done while holding the console lock.

Introduce a more intelligent approach to skb queue management. Instead
of constantly attempting to refill the queue, the system now defers
refilling to a work queue and only triggers the workqueue when a buffer
is actually dequeued. This change significantly reduces operations with
the lock held.

Add a work_struct to the netpoll structure for asynchronous refilling,
updating find_skb() to schedule refill work only when necessary (an skb
is dequeued).

These changes have demonstrated a 15% reduction in time spent during
netpoll_send_msg operations, especially when no SKBs are consumed from
the pool. When SKBs are being dequeued, the improvement is even better,
around 70%, mainly because refilling the SKB pool now happens outside
of the critical path (with the console_owner lock held).

Signed-off-by: Breno Leitao
Reviewed-by: Simon Horman
Link: https://patch.msgid.link/20250304-netpoll_refill_v2-v1-1-06e2916a4642@debian.org
Signed-off-by: Jakub Kicinski
---
 include/linux/netpoll.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index f91e50a76efd..f6e8abe0b1f1 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -33,6 +33,7 @@ struct netpoll {
 	u16 local_port, remote_port;
 	u8 remote_mac[ETH_ALEN];
 	struct sk_buff_head skb_pool;
+	struct work_struct refill_wq;
 };
 
 struct netpoll_info {
-- 
cgit v1.2.3


From ed2b548f1017586c44f50654ef9febb42d491f31 Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Thu, 6 Mar 2025 20:19:09 -0800
Subject: ubsan/overflow: Rework integer overflow sanitizer option to turn on everything

Since we're going to approach integer overflow mitigation a type at a
time, we need to enable all of the associated sanitizers, and then opt
into types one at a time. Rename the existing "signed wrap" sanitizer
to just the entire topic area: "integer wrap". Enable the implicit
integer truncation sanitizers, with required callbacks and tests.

Notably, this requires features (currently) only available in Clang,
so we can depend on the cc-option tests to determine availability
instead of doing version tests.
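As a hedged illustration (a made-up example, not from the patch): the
truncation sanitizers instrument implicit narrowing conversions, while
functions annotated with __signed_wrap stay exempt from the wrap-around
checks:

static u8 narrow_cast(u32 wide)
{
	return wide;	/* implicit truncation: now instrumented */
}

static inline int __signed_wrap wrapping_add(int a, int b)
{
	return a + b;	/* intentional wrap-around, not trapped */
}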
Link: https://lore.kernel.org/r/20250307041914.937329-1-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/compiler_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index f59393464ea7..4ad3e900bc3d 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -360,7 +360,7 @@ struct ftrace_likely_data { #endif /* Do not trap wrapping arithmetic within an annotated function. */ -#ifdef CONFIG_UBSAN_SIGNED_WRAP +#ifdef CONFIG_UBSAN_INTEGER_WRAP # define __signed_wrap __attribute__((no_sanitize("signed-integer-overflow"))) #else # define __signed_wrap -- cgit v1.2.3 From 4ae89b1fe7c2e37a8f2ea39765e4c40c9d42a101 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 19 Dec 2024 17:28:59 +0000 Subject: capability: Remove unused has_capability The vanilla has_capability() function has been unused since 2018's commit dcb569cf6ac9 ("Smack: ptrace capability use fixes") Remove it. Fixup a comment in security/commoncap.c that referenced it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Paul Moore Signed-off-by: Serge Hallyn --- include/linux/capability.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index 0c356a517991..1fb08922552c 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -139,7 +139,6 @@ static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a, } #ifdef CONFIG_MULTIUSER -extern bool has_capability(struct task_struct *t, int cap); extern bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap); extern bool has_capability_noaudit(struct task_struct *t, int cap); @@ -150,10 +149,6 @@ extern bool ns_capable(struct user_namespace *ns, int cap); extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); extern bool ns_capable_setid(struct user_namespace *ns, int cap); #else -static inline bool has_capability(struct task_struct *t, int cap) -{ - return true; -} static inline bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { -- cgit v1.2.3 From cc47f07234f72cbd8e2c973cdbf2a6730660a463 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 27 Feb 2025 17:04:46 +0800 Subject: crypto: lzo - Fix compression buffer overrun Unlike the decompression code, the compression code in LZO never checked for output overruns. It instead assumes that the caller always provides enough buffer space, disregarding the buffer length provided by the caller. Add a safe compression interface that checks for the end of buffer before each write. Use the safe interface in crypto/lzo. Signed-off-by: Herbert Xu Reviewed-by: David Sterba Signed-off-by: Herbert Xu --- include/linux/lzo.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lzo.h b/include/linux/lzo.h index e95c7d1092b2..4d30e3624acd 100644 --- a/include/linux/lzo.h +++ b/include/linux/lzo.h @@ -24,10 +24,18 @@ int lzo1x_1_compress(const unsigned char *src, size_t src_len, unsigned char *dst, size_t *dst_len, void *wrkmem); +/* Same as above but does not write more than dst_len to dst. 
*/ +int lzo1x_1_compress_safe(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + /* This requires 'wrkmem' of size LZO1X_1_MEM_COMPRESS */ int lzorle1x_1_compress(const unsigned char *src, size_t src_len, unsigned char *dst, size_t *dst_len, void *wrkmem); +/* Same as above but does not write more than dst_len to dst. */ +int lzorle1x_1_compress_safe(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + /* safe decompression with overrun testing */ int lzo1x_decompress_safe(const unsigned char *src, size_t src_len, unsigned char *dst, size_t *dst_len); -- cgit v1.2.3 From fcc155008a20fa31b01569e105250490750f0687 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Tue, 4 Mar 2025 16:57:46 +1100 Subject: vsprintf: add simple_strntoul cpio extraction currently does a memcpy to ensure that the archive hex fields are null terminated for simple_strtoul(). simple_strntoul() will allow us to avoid the memcpy. Signed-off-by: David Disseldorp Link: https://lore.kernel.org/r/20250304061020.9815-4-ddiss@suse.de Signed-off-by: Christian Brauner --- include/linux/kstrtox.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kstrtox.h b/include/linux/kstrtox.h index 7fcf29a4e0de..6ea897222af1 100644 --- a/include/linux/kstrtox.h +++ b/include/linux/kstrtox.h @@ -143,6 +143,7 @@ static inline int __must_check kstrtos32_from_user(const char __user *s, size_t */ extern unsigned long simple_strtoul(const char *,char **,unsigned int); +extern unsigned long simple_strntoul(const char *,char **,unsigned int,size_t); extern long simple_strtol(const char *,char **,unsigned int); extern unsigned long long simple_strtoull(const char *,char **,unsigned int); extern long long simple_strtoll(const char *,char **,unsigned int); -- cgit v1.2.3 From 0704bf439655c1f82ff7e623b1124a3c1cb8c907 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 3 Mar 2025 12:11:03 +0100 Subject: vdso: Introduce vdso/cache.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vDSO implementation can only include headers from the vdso/ namespace. To enable the usage of ____cacheline_aligned from the vDSO, move it and its dependencies into a new header vdso/cache.h. Keep compatibility by including vdso/cache.h from linux/cache.h. 
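A minimal sketch of what the split enables (the structure below is
hypothetical, for illustration only):

#include <vdso/cache.h>	/* allowed in vDSO code; linux/cache.h is not */

struct example_vdso_state {
	unsigned long seq;
} ____cacheline_aligned;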
Signed-off-by: Thomas Weißschuh
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/all/20250303-vdso-clock-v1-1-c1b5c69a166f@linutronix.de
---
 include/linux/cache.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cache.h b/include/linux/cache.h
index ca2a05682a54..e69768f50d53 100644
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -3,16 +3,13 @@
 #define __LINUX_CACHE_H
 
 #include <uapi/linux/kernel.h>
+#include <vdso/cache.h>
 #include <asm/cache.h>
 
 #ifndef L1_CACHE_ALIGN
 #define L1_CACHE_ALIGN(x) __ALIGN_KERNEL(x, L1_CACHE_BYTES)
 #endif
 
-#ifndef SMP_CACHE_BYTES
-#define SMP_CACHE_BYTES L1_CACHE_BYTES
-#endif
-
 /**
  * SMP_CACHE_ALIGN - align a value to the L2 cacheline size
  * @x: value to align
@@ -63,10 +60,6 @@
 #define __ro_after_init __section(".data..ro_after_init")
 #endif
 
-#ifndef ____cacheline_aligned
-#define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
-#endif
-
 #ifndef ____cacheline_aligned_in_smp
 #ifdef CONFIG_SMP
 #define ____cacheline_aligned_in_smp ____cacheline_aligned
-- 
cgit v1.2.3


From 52132f3a63b33fd38ceef07392ed176db84d579f Mon Sep 17 00:00:00 2001
From: Niklas Cassel
Date: Fri, 31 Jan 2025 19:29:50 +0100
Subject: PCI: endpoint: Allow EPF drivers to configure the size of Resizable BARs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A resizable BAR is different from a normal BAR in a few ways:

- The minimum size of a resizable BAR is 1 MB.
- Each BAR that is resizable has a Capability and Control register in
  the Resizable BAR Capability structure.

These registers contain the supported sizes and the currently selected
size of a resizable BAR. The supported sizes field is a bitmap of the
supported sizes. The selected size is a single value that is equal to
one of the supported sizes.

A resizable BAR thus has to be configured differently than a
BAR_PROGRAMMABLE BAR, which usually sets the BAR size/mask in a vendor
specific way.

The PCI endpoint framework currently does not support resizable BARs.
Add a BAR type BAR_RESIZABLE, so that an EPC driver can support
resizable BARs properly.

Note that the pci_epc_set_bar() API takes a struct pci_epf_bar which
tells the EPC driver how it wants to configure the BAR. struct
pci_epf_bar only has a single size struct member. This means that an
EPC driver will only be able to set a single supported size. This is
perfectly fine, as we do not need the complexity of allowing a host to
change the size of the BAR. If someone ever wants to support resizing a
resizable BAR, the pci_epc_set_bar() API can be extended in the future.

With these changes, we allow an EPF driver to configure the size of
Resizable BARs, rather than forcing them to a 1 MB size.

Signed-off-by: Niklas Cassel
Reviewed-by: Manivannan Sadhasivam
Link: https://lore.kernel.org/r/20250131182949.465530-10-cassel@kernel.org
Signed-off-by: Manivannan Sadhasivam
[kwilczynski: commit log]
Signed-off-by: Krzysztof Wilczyński
---
 include/linux/pci-epc.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index e818e3fdcded..91ce39dc0fd4 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -188,11 +188,15 @@ struct pci_epc {
  * enum pci_epc_bar_type - configurability of endpoint BAR
  * @BAR_PROGRAMMABLE: The BAR mask can be configured by the EPC.
  * @BAR_FIXED: The BAR mask is fixed by the hardware.
+ * @BAR_RESIZABLE: The BAR implements the PCI-SIG Resizable BAR Capability.
+ * NOTE: An EPC driver can currently only set a single supported + * size. * @BAR_RESERVED: The BAR should not be touched by an EPF driver. */ enum pci_epc_bar_type { BAR_PROGRAMMABLE = 0, BAR_FIXED, + BAR_RESIZABLE, BAR_RESERVED, }; -- cgit v1.2.3 From 4eb208424c9c49dc3298a45e8db7cf43fdf15600 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 31 Jan 2025 19:29:51 +0100 Subject: PCI: endpoint: Add pci_epc_bar_size_to_rebar_cap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper function to convert a size to the representation used by the Resizable BAR Capability Register. Signed-off-by: Niklas Cassel Reviewed-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20250131182949.465530-11-cassel@kernel.org [mani: squashed the change that added PCIe spec reference to comments from https://lore.kernel.org/linux-pci/20250219171454.2903059-2-cassel@kernel.org] Signed-off-by: Manivannan Sadhasivam Signed-off-by: Krzysztof Wilczyński --- include/linux/pci-epc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 91ce39dc0fd4..713348322dea 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -275,6 +275,7 @@ void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf, enum pci_epc_interface_type type); int pci_epc_write_header(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_header *hdr); +int pci_epc_bar_size_to_rebar_cap(size_t size, u32 *cap); int pci_epc_set_bar(struct pci_epc *epc, u8 func_no, u8 vfunc_no, struct pci_epf_bar *epf_bar); void pci_epc_clear_bar(struct pci_epc *epc, u8 func_no, u8 vfunc_no, -- cgit v1.2.3 From 22a01177c30fb4c0ea5e5f9f26473b5ee4660310 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Mon, 17 Feb 2025 20:26:46 +0800 Subject: PCI: endpoint: Remove unused devm_pci_epc_destroy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The static function devm_pci_epc_match() is only invoked within the devm_pci_epc_destroy(). However, since it was initially introduced, this new API has had no callers. Thus, remove both the unused API and the static function. Reviewed-by: Manivannan Sadhasivam Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250217-remove_api-v2-1-b169c9117045@quicinc.com Signed-off-by: Manivannan Sadhasivam [kwilczynski: commit log] Signed-off-by: Krzysztof Wilczyński --- include/linux/pci-epc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 713348322dea..9970ae73c8df 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -261,7 +261,6 @@ __devm_pci_epc_create(struct device *dev, const struct pci_epc_ops *ops, struct pci_epc * __pci_epc_create(struct device *dev, const struct pci_epc_ops *ops, struct module *owner); -void devm_pci_epc_destroy(struct device *dev, struct pci_epc *epc); void pci_epc_destroy(struct pci_epc *epc); int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf, enum pci_epc_interface_type type); -- cgit v1.2.3 From 8ef890df4031121a94407c84659125cbccd3fdbe Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 7 Mar 2025 10:30:06 -0800 Subject: net: move misc netdev_lock flavors to a separate header Move the more esoteric helpers for netdev instance lock to a dedicated header. This avoids growing netdevice.h to infinity and makes rebuilding the kernel much faster (after touching the header with the helpers). 
The main netdev_lock() / netdev_unlock() functions are used in static inlines in netdevice.h and will probably be used most commonly, so keep them in netdevice.h. Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250307183006.2312761-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 81 +---------------------------------------------- 1 file changed, 1 insertion(+), 80 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d206c9592b60..9a297757df7e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2630,40 +2630,6 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, f(dev, &dev->_tx[i], arg); } -static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, - const struct lockdep_map *b) -{ - /* Only lower devices currently grab the instance lock, so no - * real ordering issues can occur. In the near future, only - * hardware devices will grab instance lock which also does not - * involve any ordering. Suppress lockdep ordering warnings - * until (if) we start grabbing instance lock on pure SW - * devices (bond/team/veth/etc). - */ - if (a == b) - return 0; - return -1; -} - -#define netdev_lockdep_set_classes(dev) \ -{ \ - static struct lock_class_key qdisc_tx_busylock_key; \ - static struct lock_class_key qdisc_xmit_lock_key; \ - static struct lock_class_key dev_addr_list_lock_key; \ - static struct lock_class_key dev_instance_lock_key; \ - unsigned int i; \ - \ - (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ - lockdep_set_class(&(dev)->addr_list_lock, \ - &dev_addr_list_lock_key); \ - lockdep_set_class(&(dev)->lock, \ - &dev_instance_lock_key); \ - lock_set_cmp_fn(&dev->lock, netdev_lock_cmp_fn, NULL); \ - for (i = 0; i < (dev)->num_tx_queues; i++) \ - lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ - &qdisc_xmit_lock_key); \ -} - u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, @@ -2765,56 +2731,11 @@ static inline void netdev_lock(struct net_device *dev) mutex_lock(&dev->lock); } -static inline bool netdev_trylock(struct net_device *dev) -{ - return mutex_trylock(&dev->lock); -} - static inline void netdev_unlock(struct net_device *dev) { mutex_unlock(&dev->lock); } - -static inline void netdev_assert_locked(struct net_device *dev) -{ - lockdep_assert_held(&dev->lock); -} - -static inline void netdev_assert_locked_or_invisible(struct net_device *dev) -{ - if (dev->reg_state == NETREG_REGISTERED || - dev->reg_state == NETREG_UNREGISTERING) - netdev_assert_locked(dev); -} - -static inline bool netdev_need_ops_lock(struct net_device *dev) -{ - bool ret = dev->request_ops_lock || !!dev->queue_mgmt_ops; - -#if IS_ENABLED(CONFIG_NET_SHAPER) - ret |= !!dev->netdev_ops->net_shaper_ops; -#endif - - return ret; -} - -static inline void netdev_lock_ops(struct net_device *dev) -{ - if (netdev_need_ops_lock(dev)) - netdev_lock(dev); -} - -static inline void netdev_unlock_ops(struct net_device *dev) -{ - if (netdev_need_ops_lock(dev)) - netdev_unlock(dev); -} - -static inline void netdev_ops_assert_locked(struct net_device *dev) -{ - if (netdev_need_ops_lock(dev)) - lockdep_assert_held(&dev->lock); -} +/* Additional netdev_lock()-related helpers are in net/netdev_lock.h */ void netif_napi_set_irq_locked(struct napi_struct *napi, int irq); -- cgit v1.2.3 From f6f425f3d251c059d1251edd4f37024290d3efca Mon Sep 17 00:00:00 2001 From: Chiara Meiohas 
Date: Wed, 26 Feb 2025 15:01:05 +0200 Subject: net/mlx5: Add RDMA_CTRL HW capabilities Add RDMA_CTRL UCTX capabilities and add the RDMA_CTRL general object type in hca_cap_2. Reviewed-by: Moshe Shemesh Signed-off-by: Chiara Meiohas Link: https://patch.msgid.link/ef7eb24be9a6f247ab52e8b4480350072e5182f5.1740574103.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index cc2875e843f7..3b3d88ffcacc 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1570,6 +1570,8 @@ enum { enum { MLX5_UCTX_CAP_RAW_TX = 1UL << 0, MLX5_UCTX_CAP_INTERNAL_DEV_RES = 1UL << 1, + MLX5_UCTX_CAP_RDMA_CTRL = 1UL << 3, + MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA = 1UL << 4, }; #define MLX5_FC_BULK_SIZE_FACTOR 128 @@ -2140,7 +2142,8 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 log_min_mkey_entity_size[0x5]; u8 reserved_at_1b0[0x10]; - u8 reserved_at_1c0[0x60]; + u8 general_obj_types_127_64[0x40]; + u8 reserved_at_200[0x20]; u8 reserved_at_220[0x1]; u8 sw_vhca_id_valid[0x1]; @@ -12494,6 +12497,10 @@ enum { MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = BIT_ULL(0x24), }; +enum { + MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL = BIT_ULL(0x13), +}; + enum { MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = 0xc, MLX5_GENERAL_OBJECT_TYPES_IPSEC = 0x13, @@ -12501,6 +12508,7 @@ enum { MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = 0x24, MLX5_GENERAL_OBJECT_TYPES_MACSEC = 0x27, MLX5_GENERAL_OBJECT_TYPES_INT_KEK = 0x47, + MLX5_GENERAL_OBJECT_TYPES_RDMA_CTRL = 0x53, MLX5_GENERAL_OBJECT_TYPES_FLOW_TABLE_ALIAS = 0xff15, }; -- cgit v1.2.3 From 0a34fad1bed45ff2245ab8c315bc3d4c6471af46 Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Wed, 26 Feb 2025 15:01:06 +0200 Subject: net/mlx5: Allow the throttle mechanism to be more dynamic Previously, throttle commands were identified and limited based on opcode. These commands were limited to half the command slots using a semaphore, and callback commands checked the opcode to determine semaphore release. To allow exceptions, we introduce a variable to indicate when the throttle lock is held. This allows scenarios where throttle commands are not limited. Callback functions use this variable to determine if the throttle semaphore needs to be released. This patch contains no functional changes. It's a preparation for the next patch. Signed-off-by: Chiara Meiohas Reviewed-by: Tariq Toukan Reviewed-by: Moshe Shemesh Link: https://patch.msgid.link/055d975edeb816ac4c0fd1e665c6157d11947d26.1740574103.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index af86097641b0..876d6b03a87a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -989,6 +989,7 @@ struct mlx5_async_work { mlx5_async_cbk_t user_callback; u16 opcode; /* cmd opcode */ u16 op_mod; /* cmd op_mod */ + u8 throttle_locked:1; void *out; /* pointer to the cmd output buffer */ }; -- cgit v1.2.3 From f9deed0980fe29c57488f395528fe3d923c91b1f Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Wed, 26 Feb 2025 15:01:07 +0200 Subject: net/mlx5: Limit non-privileged commands Limit non-privileged UID commands to half of the available command slots when privileged UIDs are present. Privileged throttle commands will not be limited. 
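A minimal sketch of the mechanism (hypothetical helper; the real mlx5 code
differs in detail, and the field names are taken from the diff below):
non-privileged commands take a slot from a half-sized semaphore, while
kernel commands and privileged UIDs bypass it.

	static void mlx5_cmd_enter_unpriv(struct mlx5_cmd *cmd, u16 uid,
					  struct mlx5_async_work *work)
	{
		/* uid 0 (kernel) and registered privileged UIDs are never limited */
		if (!uid || xa_load(&cmd->vars.privileged_uids, uid))
			return;

		down(&cmd->vars.unprivileged_sem);
		work->unpriv_locked = 1;	/* release the slot on completion */
	}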
Use an xarray to store privileged UIDs. Add insert and remove functions for privileged UIDs management. Non-user commands (with uid 0) are not limited. Signed-off-by: Chiara Meiohas Reviewed-by: Moshe Shemesh Reviewed-by: Tariq Toukan Link: https://patch.msgid.link/d2f3dd9a0dbad3c9f2b4bb0723837995e4e06de2.1740574103.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 876d6b03a87a..4f593a61220d 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -305,6 +305,8 @@ struct mlx5_cmd { struct semaphore sem; struct semaphore pages_sem; struct semaphore throttle_sem; + struct semaphore unprivileged_sem; + struct xarray privileged_uids; } vars; enum mlx5_cmdif_state state; void *cmd_alloc_buf; @@ -990,6 +992,7 @@ struct mlx5_async_work { u16 opcode; /* cmd opcode */ u16 op_mod; /* cmd op_mod */ u8 throttle_locked:1; + u8 unpriv_locked:1; void *out; /* pointer to the cmd output buffer */ }; @@ -1020,6 +1023,8 @@ int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); bool mlx5_cmd_is_down(struct mlx5_core_dev *dev); +int mlx5_cmd_add_privileged_uid(struct mlx5_core_dev *dev, u16 uid); +void mlx5_cmd_remove_privileged_uid(struct mlx5_core_dev *dev, u16 uid); void mlx5_core_uplink_netdev_set(struct mlx5_core_dev *mdev, struct net_device *netdev); void mlx5_core_uplink_netdev_event_replay(struct mlx5_core_dev *mdev); -- cgit v1.2.3 From ab7d228c7e0d0efcac52b81f8514b43985747dc6 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 26 Feb 2025 15:01:08 +0200 Subject: net/mlx5: Query ADV_RDMA capabilities Query ADV_RDMA capabilities which provide information for advanced RDMA related features. 
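A hedged usage sketch of the new capability accessors (the adv_rdma bit and
the MLX5_CAP_ADV_RDMA() macro appear in the diff below; the caller and the
enable_rdma_transport() hook are invented):

	if (MLX5_CAP_GEN(mdev, adv_rdma) &&
	    MLX5_CAP_ADV_RDMA(mdev, rdma_transport_manager))
		enable_rdma_transport(mdev);	/* hypothetical driver hook */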
Signed-off-by: Patrisious Haddad Reviewed-by: Mark Bloch Link: https://patch.msgid.link/e3e6ede03ea31cd201078dcdd4e407608e4a5a87.1740574103.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 5 +++++ include/linux/mlx5/mlx5_ifc.h | 42 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index fd37f4e54d76..0ae6d69c5221 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1251,6 +1251,7 @@ enum mlx5_cap_type { MLX5_CAP_GENERAL_2 = 0x20, MLX5_CAP_PORT_SELECTION = 0x25, MLX5_CAP_ADV_VIRTUALIZATION = 0x26, + MLX5_CAP_ADV_RDMA = 0x28, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1384,6 +1385,10 @@ enum mlx5_qcam_feature_groups { MLX5_GET(adv_virtualization_cap, \ mdev->caps.hca[MLX5_CAP_ADV_VIRTUALIZATION]->cur, cap) +#define MLX5_CAP_ADV_RDMA(mdev, cap) \ + MLX5_GET(adv_rdma_cap, \ + mdev->caps.hca[MLX5_CAP_ADV_RDMA]->cur, cap) + #define MLX5_CAP_FLOWTABLE_PORT_SELECTION(mdev, cap) \ MLX5_CAP_PORT_SELECTION(mdev, flow_table_properties_port_selection.cap) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3b3d88ffcacc..fea8af42f954 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1993,7 +1993,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 max_geneve_tlv_options[0x8]; u8 reserved_at_568[0x3]; u8 max_geneve_tlv_option_data_len[0x5]; - u8 reserved_at_570[0x9]; + u8 reserved_at_570[0x1]; + u8 adv_rdma[0x1]; + u8 reserved_at_572[0x7]; u8 adv_virtualization[0x1]; u8 reserved_at_57a[0x6]; @@ -13076,6 +13078,44 @@ struct mlx5_ifc_load_vhca_state_out_bits { u8 reserved_at_40[0x40]; }; +struct mlx5_ifc_adv_rdma_cap_bits { + u8 rdma_transport_manager[0x1]; + u8 rdma_transport_manager_other_eswitch[0x1]; + u8 reserved_at_2[0x1e]; + + u8 rcx_type[0x8]; + u8 reserved_at_28[0x2]; + u8 ps_entry_log_max_value[0x6]; + u8 reserved_at_30[0x6]; + u8 qp_max_ps_num_entry[0xa]; + + u8 mp_max_num_queues[0x8]; + u8 ps_user_context_max_log_size[0x8]; + u8 message_based_qp_and_striding_wq[0x8]; + u8 reserved_at_58[0x8]; + + u8 max_receive_send_message_size_stride[0x10]; + u8 reserved_at_70[0x10]; + + u8 max_receive_send_message_size_byte[0x20]; + + u8 reserved_at_a0[0x160]; + + struct mlx5_ifc_flow_table_prop_layout_bits rdma_transport_rx_flow_table_properties; + + struct mlx5_ifc_flow_table_prop_layout_bits rdma_transport_tx_flow_table_properties; + + struct mlx5_ifc_flow_table_fields_supported_2_bits rdma_transport_rx_ft_field_support_2; + + struct mlx5_ifc_flow_table_fields_supported_2_bits rdma_transport_tx_ft_field_support_2; + + struct mlx5_ifc_flow_table_fields_supported_2_bits rdma_transport_rx_ft_field_bitmask_support_2; + + struct mlx5_ifc_flow_table_fields_supported_2_bits rdma_transport_tx_ft_field_bitmask_support_2; + + u8 reserved_at_800[0x3800]; +}; + struct mlx5_ifc_adv_virtualization_cap_bits { u8 reserved_at_0[0x3]; u8 pg_track_log_max_num[0x5]; -- cgit v1.2.3 From 15b103df80b25025040faa8f35164c2595977bdb Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 26 Feb 2025 15:01:09 +0200 Subject: net/mlx5: fs, add RDMA TRANSPORT steering domain support Add RX and TX RDMA_TRANSPORT flow table namespace, and the ability to create flow tables in those namespaces. The RDMA_TRANSPORT RX and TX are per vport. Packets will traverse through RDMA_TRANSPORT_RX after RDMA_RX and through RDMA_TRANSPORT_TX before RDMA_TX, ensuring proper control and management. 
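As an illustration (not from the patch; error handling trimmed), a driver
could create a table in the new per-vport RX domain roughly like this:

	static struct mlx5_flow_table *
	create_rdma_transport_rx_ft(struct mlx5_core_dev *dev, int vport_idx)
	{
		struct mlx5_flow_table_attr ft_attr = {
			.prio = MLX5_RDMA_TRANSPORT_BYPASS_PRIO,
		};
		struct mlx5_flow_namespace *ns;

		ns = mlx5_get_flow_vport_namespace(dev,
				MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX, vport_idx);
		if (!ns)
			return ERR_PTR(-EOPNOTSUPP);

		return mlx5_create_flow_table(ns, &ft_attr);
	}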
RDMA_TRANSPORT domains are managed by the vport group manager. Signed-off-by: Patrisious Haddad Reviewed-by: Mark Bloch Link: https://patch.msgid.link/a6b550d9859a197eafa804b9a8d76916ca481da9.1740574103.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 6 ++++++ include/linux/mlx5/fs.h | 10 +++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0ae6d69c5221..8fe56d0362c6 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1346,6 +1346,12 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_FLOWTABLE_RDMA_TX(mdev, cap) \ MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_transmit_rdma.cap) +#define MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_RX(mdev, cap) \ + MLX5_CAP_ADV_RDMA(mdev, rdma_transport_rx_flow_table_properties.cap) + +#define MLX5_CAP_FLOWTABLE_RDMA_TRANSPORT_TX(mdev, cap) \ + MLX5_CAP_ADV_RDMA(mdev, rdma_transport_tx_flow_table_properties.cap) + #define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \ MLX5_GET(flow_table_eswitch_cap, \ mdev->caps.hca[MLX5_CAP_ESWITCH_FLOW_TABLE]->cur, cap) diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 01cb72d68c23..fd62b2b1611d 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -40,6 +40,7 @@ #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) +#define MLX5_RDMA_TRANSPORT_BYPASS_PRIO 0 #define MLX5_FS_MAX_POOL_SIZE BIT(30) enum mlx5_flow_destination_type { @@ -110,6 +111,8 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_RDMA_TX_IPSEC, MLX5_FLOW_NAMESPACE_RDMA_RX_MACSEC, MLX5_FLOW_NAMESPACE_RDMA_TX_MACSEC, + MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_RX, + MLX5_FLOW_NAMESPACE_RDMA_TRANSPORT_TX, }; enum { @@ -194,9 +197,9 @@ struct mlx5_flow_namespace * mlx5_get_flow_namespace(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type type); struct mlx5_flow_namespace * -mlx5_get_flow_vport_acl_namespace(struct mlx5_core_dev *dev, - enum mlx5_flow_namespace_type type, - int vport); +mlx5_get_flow_vport_namespace(struct mlx5_core_dev *dev, + enum mlx5_flow_namespace_type type, + int vport_idx); struct mlx5_flow_table_attr { int prio; @@ -204,6 +207,7 @@ struct mlx5_flow_table_attr { u32 level; u32 flags; u16 uid; + u16 vport; struct mlx5_flow_table *next_ft; struct { -- cgit v1.2.3 From f4e026f454d7bb6aa84901a37641132961054735 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 7 Mar 2025 17:17:15 -0600 Subject: PCI: Fix typos Fix typos and whitespace errors. 
Link: https://lore.kernel.org/r/20250307231715.438518-1-helgaas@kernel.org Signed-off-by: Bjorn Helgaas --- include/linux/pci-epf.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index ee6156bcbbd0..879d19cebd4f 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -38,7 +38,7 @@ enum pci_barno { * @baseclass_code: broadly classifies the type of function the device performs * @cache_line_size: specifies the system cacheline size in units of DWORDs * @subsys_vendor_id: vendor of the add-in card or subsystem - * @subsys_id: id specific to vendor + * @subsys_id: ID specific to vendor * @interrupt_pin: interrupt pin the device (or device function) uses */ struct pci_epf_header { @@ -59,7 +59,7 @@ struct pci_epf_header { * @bind: ops to perform when a EPC device has been bound to EPF device * @unbind: ops to perform when a binding has been lost between a EPC device * and EPF device - * @add_cfs: ops to initialize function specific configfs attributes + * @add_cfs: ops to initialize function-specific configfs attributes */ struct pci_epf_ops { int (*bind)(struct pci_epf *epf); @@ -138,7 +138,7 @@ struct pci_epf_bar { * @epc: the EPC device to which this EPF device is bound * @epf_pf: the physical EPF device to which this virtual EPF device is bound * @driver: the EPF driver to which this EPF device is bound - * @id: Pointer to the EPF device ID + * @id: pointer to the EPF device ID * @list: to add pci_epf as a list of PCI endpoint functions to pci_epc * @lock: mutex to protect pci_epf_ops * @sec_epc: the secondary EPC device to which this EPF device is bound @@ -151,7 +151,7 @@ struct pci_epf_bar { * @is_vf: true - virtual function, false - physical function * @vfunction_num_map: bitmap to manage virtual function number * @pci_vepf: list of virtual endpoint functions associated with this function - * @event_ops: Callbacks for capturing the EPC events + * @event_ops: callbacks for capturing the EPC events */ struct pci_epf { struct device dev; @@ -185,11 +185,12 @@ struct pci_epf { }; /** - * struct pci_epf_msix_tbl - represents the MSIX table entry structure - * @msg_addr: Writes to this address will trigger MSIX interrupt in host - * @msg_data: Data that should be written to @msg_addr to trigger MSIX interrupt + * struct pci_epf_msix_tbl - represents the MSI-X table entry structure + * @msg_addr: Writes to this address will trigger MSI-X interrupt in host + * @msg_data: Data that should be written to @msg_addr to trigger MSI-X + * interrupt * @vector_ctrl: Identifies if the function is prohibited from sending a message - * using this MSIX table entry + * using this MSI-X table entry */ struct pci_epf_msix_tbl { u64 msg_addr; -- cgit v1.2.3 From 92d2873bedf33974b04530215692705185ec6572 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Mon, 3 Mar 2025 08:45:15 +0000 Subject: print: use new #[export] macro for rust_fmt_argument This moves the rust_fmt_argument function over to use the new #[export] macro, which will verify at compile-time that the function signature matches what is in the header file. Reviewed-by: Andreas Hindborg Reviewed-by: Tamir Duberstein Acked-by: Greg Kroah-Hartman Signed-off-by: Alice Ryhl Acked-by: Petr Mladek Link: https://lore.kernel.org/r/20250303-export-macro-v3-4-41fbad85a27f@google.com [ Removed period as requested by Andy. 
- Miguel ] Signed-off-by: Miguel Ojeda --- include/linux/sprintf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sprintf.h b/include/linux/sprintf.h index 33dcbec71925..51cab2def9ec 100644 --- a/include/linux/sprintf.h +++ b/include/linux/sprintf.h @@ -24,4 +24,7 @@ __scanf(2, 0) int vsscanf(const char *, const char *, va_list); extern bool no_hash_pointers; int no_hash_pointers_enable(char *str); +/* Used for Rust formatting ('%pA') */ +char *rust_fmt_argument(char *buf, char *end, const void *ptr); + #endif /* _LINUX_KERNEL_SPRINTF_H */ -- cgit v1.2.3 From df896e4f7cf5cc3abffb186e2b6815b785500b57 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 27 Feb 2025 22:06:02 +0800 Subject: soundwire: extend sdw_stream_type to BPT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BPT/BRA need to be special cased, i.e. there's no point in using the bandwidth allocation since the entire frame can be used. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Tested-by: shumingf@realtek.com Link: https://lore.kernel.org/r/20250227140615.8147-4-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 2d6c30317792..857b670eb9a5 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -150,12 +150,14 @@ enum sdw_dpn_pkg_mode { * * @SDW_STREAM_PCM: PCM data stream * @SDW_STREAM_PDM: PDM data stream + * @SDW_STREAM_BPT: BPT data stream * * spec doesn't define this, but is used in implementation */ enum sdw_stream_type { SDW_STREAM_PCM = 0, SDW_STREAM_PDM = 1, + SDW_STREAM_BPT = 2, }; /** @@ -879,7 +881,7 @@ struct sdw_port_config { * @ch_count: Channel count of the stream * @bps: Number of bits per audio sample * @direction: Data direction - * @type: Stream type PCM or PDM + * @type: Stream type PCM, PDM or BPT */ struct sdw_stream_config { unsigned int frame_rate; @@ -929,7 +931,7 @@ struct sdw_stream_params { * @name: SoundWire stream name * @params: Stream parameters * @state: Current state of the stream - * @type: Stream type PCM or PDM + * @type: Stream type PCM, PDM or BPT * @m_rt_count: Count of Master runtime(s) in this stream * @master_list: List of Master runtime(s) in this stream. * master_list can contain only one m_rt per Master instance -- cgit v1.2.3 From dc90bbefa792031d89fe2af9ad4a6febd6be96a9 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 27 Feb 2025 22:06:03 +0800 Subject: soundwire: stream: extend sdw_alloc_stream() to take 'type' parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the existing definition of sdw_stream_runtime, the 'type' member is never set and defaults to PCM. To prepare for the BPT/BRA support, we need to special-case streams and make use of the 'type'. No functional change for now, the implicit PCM type is now explicit. 
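A usage sketch of the extended API (stream names invented; the enum values
and new signature come from the diffs in this series):

	struct sdw_stream_runtime *pcm, *bpt;

	pcm = sdw_alloc_stream("playback", SDW_STREAM_PCM);	/* was implicitly PCM */
	bpt = sdw_alloc_stream("bpt-xfer", SDW_STREAM_BPT);	/* new BPT usage */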
Signed-off-by: Pierre-Louis Bossart
Signed-off-by: Bard Liao
Reviewed-by: Péter Ujfalusi
Reviewed-by: Liam Girdwood
Reviewed-by: Ranjani Sridharan
Tested-by: shumingf@realtek.com
Link: https://lore.kernel.org/r/20250227140615.8147-5-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul
---
 include/linux/soundwire/sdw.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 857b670eb9a5..ba58e2a52322 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -1019,7 +1019,7 @@ struct sdw_bus {
 unsigned int lane_used_bandwidth[SDW_MAX_LANES];
 };
-struct sdw_stream_runtime *sdw_alloc_stream(const char *stream_name);
+struct sdw_stream_runtime *sdw_alloc_stream(const char *stream_name, enum sdw_stream_type type);
 void sdw_release_stream(struct sdw_stream_runtime *stream);
 int sdw_compute_params(struct sdw_bus *bus, struct sdw_stream_runtime *stream);
-- cgit v1.2.3

From 9a756289ac5a8517dc643555d784d830b61576ad Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart
Date: Thu, 27 Feb 2025 22:06:06 +0800
Subject: soundwire: bus: add send_async/wait APIs for BPT protocol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add definitions and helpers for the BPT/BRA protocol. Peripheral drivers
(aka ASoC codec drivers) can use this API to send bulk data such as
firmware or tables. The design intent is however NOT to directly use this
API but to rely on an intermediate regmap layer.

The API is only available when no other audio streams have been allocated,
and only one BPT/BRA stream is allowed per link.

To avoid the addition of yet another lock, the refcount tests are handled
in the stream master_runtime alloc/free routines where the bus_lock is
already held. Another benefit of this approach is that the same bus_lock is
used to handle runtime and port linked lists, which reduces the potential
for misaligned configurations.

In addition to exclusion with audio streams, BPT transfers have a lot of
overhead; specifically, register writes are needed to enable transport in
DP0. Most DMAs don't handle very small data sets well, and they may have
alignment limitations. The size and alignment requirements are for now not
handled by the core but must be checked by platform-specific drivers.

Signed-off-by: Pierre-Louis Bossart
Signed-off-by: Bard Liao
Reviewed-by: Péter Ujfalusi
Reviewed-by: Liam Girdwood
Reviewed-by: Ranjani Sridharan
Tested-by: shumingf@realtek.com
Link: https://lore.kernel.org/r/20250227140615.8147-8-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul
---
 include/linux/soundwire/sdw.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index ba58e2a52322..69f3e700796d 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -824,6 +824,15 @@ struct sdw_defer {
 struct completion complete;
 };
+/*
+ * Add a practical limit to BPT transfer sizes. BPT is typically used
+ * to transfer firmware, and larger firmware transfers will increase
+ * the cold latency beyond typical OS or user requirements.
+ */
+#define SDW_BPT_MSG_MAX_BYTES (1024 * 1024)
+
+struct sdw_bpt_msg;
+
 /**
  * struct sdw_master_ops - Master driver ops
  * @read_prop: Read Master properties
@@ -839,6 +848,10 @@ struct sdw_defer {
  * @get_device_num: Callback for vendor-specific device_number allocation
  * @put_device_num: Callback for vendor-specific device_number release
  * @new_peripheral_assigned: Callback to handle enumeration of new peripheral.
+ * @bpt_send_async: reserve resources for BPT stream and send message
+ using BPT protocol
+ * @bpt_wait: wait for message completion using BPT protocol
+ and release resources
 */
 struct sdw_master_ops {
 int (*read_prop)(struct sdw_bus *bus);
@@ -855,6 +868,9 @@ struct sdw_master_ops {
 void (*new_peripheral_assigned)(struct sdw_bus *bus,
 struct sdw_slave *slave,
 int dev_num);
+ int (*bpt_send_async)(struct sdw_bus *bus, struct sdw_slave *slave,
+ struct sdw_bpt_msg *msg);
+ int (*bpt_wait)(struct sdw_bus *bus, struct sdw_slave *slave, struct sdw_bpt_msg *msg);
 };
 int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent,
@@ -961,6 +977,8 @@ struct sdw_stream_runtime {
 * @defer_msg: Defer message
 * @params: Current bus parameters
 * @stream_refcount: number of streams currently using this bus
+ * @bpt_stream_refcount: number of BPT streams currently using this bus (should
+ be zero or one; multiple streams per link are not supported).
 * @ops: Master callback ops
 * @port_ops: Master port callback ops
 * @prop: Master properties
@@ -998,6 +1016,7 @@ struct sdw_bus {
 struct sdw_defer defer_msg;
 struct sdw_bus_params params;
 int stream_refcount;
+ int bpt_stream_refcount;
 const struct sdw_master_ops *ops;
 const struct sdw_master_port_ops *port_ops;
 struct sdw_master_prop prop;
@@ -1045,6 +1064,10 @@ int sdw_compare_devid(struct sdw_slave *slave, struct sdw_slave_id id);
 void sdw_extract_slave_id(struct sdw_bus *bus, u64 addr, struct sdw_slave_id *id);
 bool is_clock_scaling_supported_by_slave(struct sdw_slave *slave);
+int sdw_bpt_send_async(struct sdw_bus *bus, struct sdw_slave *slave, struct sdw_bpt_msg *msg);
+int sdw_bpt_wait(struct sdw_bus *bus, struct sdw_slave *slave, struct sdw_bpt_msg *msg);
+int sdw_bpt_send_sync(struct sdw_bus *bus, struct sdw_slave *slave, struct sdw_bpt_msg *msg);
+
 #if IS_ENABLED(CONFIG_SOUNDWIRE)
 int sdw_stream_add_slave(struct sdw_slave *slave,
-- cgit v1.2.3

From 8e4a239b403bd8aed8787798de8e4e42f79246c2 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart
Date: Thu, 27 Feb 2025 22:06:07 +0800
Subject: soundwire: bus: add bpt_stream pointer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a convenience pointer to the 'sdw_bus' structure. BPT is a dedicated
stream which will typically not be handled by DAIs or DAI links. Since
there's only one BPT stream per link, storing the pointer at the link level
seems rather natural.
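Based on the declarations above, the synchronous helper is presumably a
thin wrapper over the async pair; a sketch (not the verbatim
implementation):

	int sdw_bpt_send_sync(struct sdw_bus *bus, struct sdw_slave *slave,
			      struct sdw_bpt_msg *msg)
	{
		int ret;

		ret = sdw_bpt_send_async(bus, slave, msg);
		if (ret < 0)
			return ret;

		return sdw_bpt_wait(bus, slave, msg);
	}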
Signed-off-by: Pierre-Louis Bossart
Signed-off-by: Bard Liao
Reviewed-by: Péter Ujfalusi
Reviewed-by: Liam Girdwood
Reviewed-by: Ranjani Sridharan
Tested-by: shumingf@realtek.com
Link: https://lore.kernel.org/r/20250227140615.8147-9-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul
---
 include/linux/soundwire/sdw.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 69f3e700796d..2362f621d94c 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -979,6 +979,7 @@ struct sdw_stream_runtime {
 * @stream_refcount: number of streams currently using this bus
 * @bpt_stream_refcount: number of BPT streams currently using this bus (should
 * be zero or one; multiple streams per link are not supported).
+ * @bpt_stream: pointer stored to handle BPT streams.
 * @ops: Master callback ops
 * @port_ops: Master port callback ops
 * @prop: Master properties
@@ -1017,6 +1018,7 @@ struct sdw_bus {
 struct sdw_bus_params params;
 int stream_refcount;
 int bpt_stream_refcount;
+ struct sdw_stream_runtime *bpt_stream;
 const struct sdw_master_ops *ops;
 const struct sdw_master_port_ops *port_ops;
 struct sdw_master_prop prop;
-- cgit v1.2.3

From 7f17a73a7dd8252aa88c6f5e23310861de3d5423 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart
Date: Thu, 27 Feb 2025 22:06:09 +0800
Subject: soundwire: intel_auxdevice: add indirection for BPT send_async/wait
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mirror the abstraction added for the master ops.

Signed-off-by: Pierre-Louis Bossart
Signed-off-by: Bard Liao
Reviewed-by: Péter Ujfalusi
Reviewed-by: Liam Girdwood
Reviewed-by: Ranjani Sridharan
Tested-by: shumingf@realtek.com
Link: https://lore.kernel.org/r/20250227140615.8147-11-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul
---
 include/linux/soundwire/sdw_intel.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index 580086417e4b..493d9de4e472 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -436,6 +436,10 @@ struct sdw_intel_hw_ops {
 bool (*sync_check_cmdsync_unlocked)(struct sdw_intel *sdw);
 void (*program_sdi)(struct sdw_intel *sdw, int dev_num);
+
+ int (*bpt_send_async)(struct sdw_intel *sdw, struct sdw_slave *slave,
+ struct sdw_bpt_msg *msg);
+ int (*bpt_wait)(struct sdw_intel *sdw, struct sdw_slave *slave, struct sdw_bpt_msg *msg);
 };
 extern const struct sdw_intel_hw_ops sdw_intel_cnl_hw_ops;
-- cgit v1.2.3

From b02d41d884f64d22d5d5a2a4b35550f5e80bdb3d Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert"
Date: Thu, 6 Mar 2025 01:54:08 +0000
Subject: phy: core: Remove unused phy_pm_runtime_(allow|forbid)

phy_pm_runtime_allow() and phy_pm_runtime_forbid() were added in 2013 as
part of commit ff764963479a ("drivers: phy: add generic PHY framework")
but have remained unused.

Remove them.

Fix up the (English) docs - I've left the Chinese translation.

Signed-off-by: Dr.
David Alan Gilbert Link: https://lore.kernel.org/r/20250306015408.277729-1-linux@treblig.org Signed-off-by: Vinod Koul --- include/linux/phy/phy.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index 03cd5bae92d3..e63e6e70e860 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -227,8 +227,6 @@ int phy_pm_runtime_get(struct phy *phy); int phy_pm_runtime_get_sync(struct phy *phy); int phy_pm_runtime_put(struct phy *phy); int phy_pm_runtime_put_sync(struct phy *phy); -void phy_pm_runtime_allow(struct phy *phy); -void phy_pm_runtime_forbid(struct phy *phy); int phy_init(struct phy *phy); int phy_exit(struct phy *phy); int phy_power_on(struct phy *phy); @@ -321,16 +319,6 @@ static inline int phy_pm_runtime_put_sync(struct phy *phy) return -ENOSYS; } -static inline void phy_pm_runtime_allow(struct phy *phy) -{ - return; -} - -static inline void phy_pm_runtime_forbid(struct phy *phy) -{ - return; -} - static inline int phy_init(struct phy *phy) { if (!phy) -- cgit v1.2.3 From f550694e88b7b13b647777f889e03e544d9db60c Mon Sep 17 00:00:00 2001 From: Yael Chemla Date: Sun, 9 Mar 2025 20:41:37 +0200 Subject: net/mlx5: Add IFC bits for PPCNT recovery counters group Add recovery counters group layout of PPCNT (Ports Performance Counters Register). This group counts recovery events per link. Also add the corresponding bit in PCAM to indicate this group is supported. Signed-off-by: Yael Chemla Reviewed-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1741545697-23041-1-git-send-email-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 1 + include/linux/mlx5/mlx5_ifc.h | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 8fe56d0362c6..904804e995aa 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1517,6 +1517,7 @@ enum { MLX5_PHYSICAL_LAYER_COUNTERS_GROUP = 0x12, MLX5_PER_TRAFFIC_CLASS_CONGESTION_GROUP = 0x13, MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP = 0x16, + MLX5_PHYSICAL_LAYER_RECOVERY_GROUP = 0x1a, MLX5_INFINIBAND_PORT_COUNTERS_GROUP = 0x20, MLX5_INFINIBAND_EXTENDED_PORT_COUNTERS_GROUP = 0x21, }; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index fea8af42f954..2c09df4ee574 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2645,6 +2645,12 @@ struct mlx5_ifc_field_select_802_1qau_rp_bits { u8 field_select_8021qaurp[0x20]; }; +struct mlx5_ifc_phys_layer_recovery_cntrs_bits { + u8 total_successful_recovery_events[0x20]; + + u8 reserved_at_20[0x7a0]; +}; + struct mlx5_ifc_phys_layer_cntrs_bits { u8 time_since_last_clear_high[0x20]; @@ -4846,6 +4852,7 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits { struct mlx5_ifc_ib_ext_port_cntrs_grp_data_layout_bits ib_ext_port_cntrs_grp_data_layout; struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs; struct mlx5_ifc_phys_layer_statistical_cntrs_bits phys_layer_statistical_cntrs; + struct mlx5_ifc_phys_layer_recovery_cntrs_bits phys_layer_recovery_cntrs; u8 reserved_at_0[0x7c0]; }; @@ -10584,7 +10591,9 @@ struct mlx5_ifc_mtutc_reg_bits { }; struct mlx5_ifc_pcam_enhanced_features_bits { - u8 reserved_at_0[0x1d]; + u8 reserved_at_0[0x10]; + u8 ppcnt_recovery_counters[0x1]; + u8 reserved_at_11[0xc]; u8 fec_200G_per_lane_in_pplm[0x1]; u8 reserved_at_1e[0x2a]; u8 
fec_100G_per_lane_in_pplm[0x1]; -- cgit v1.2.3 From b5198201932635e19068cc2e83a99adf38f32c43 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Thu, 6 Mar 2025 16:05:43 +0900 Subject: counter: Introduce the compare component Compare registers are used in devices to compare a counter channel against a particular count value (e.g. to check if a threshold has been reached). A macro COUNTER_COMP_COMPARE() is introduced to facilitate the creation of compare components as Count extensions. Link: https://lore.kernel.org/r/20250306-introduce-compare-component-v1-1-93993b3dca9c@kernel.org Signed-off-by: William Breathitt Gray --- include/linux/counter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index 426b7d58a438..f208e867dd0f 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -580,6 +580,9 @@ struct counter_array { #define COUNTER_COMP_CEILING(_read, _write) \ COUNTER_COMP_COUNT_U64("ceiling", _read, _write) +#define COUNTER_COMP_COMPARE(_read, _write) \ + COUNTER_COMP_COUNT_U64("compare", _read, _write) + #define COUNTER_COMP_COUNT_MODE(_read, _write, _available) \ { \ .type = COUNTER_COMP_COUNT_MODE, \ -- cgit v1.2.3 From aa4a1d5b198314e9ef02844d0f9426efd5127f74 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 15 Jan 2025 09:53:50 +0100 Subject: irqdomain: Remove extern from function declarations 'extern' is not needed for function declarations. So remove it from irqdomain.h. Note that the declarations are now unified as some had 'extern' and some did not. Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250115085409.1629787-2-jirislaby@kernel.org --- include/linux/irqdomain.h | 140 +++++++++++++++++++++++----------------------- 1 file changed, 71 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index e432b6a12a32..5c0ec335d980 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -350,13 +350,13 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, irq_hw_number_t first_hwirq, const struct irq_domain_ops *ops, void *host_data); -extern struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, - enum irq_domain_bus_token bus_token); -extern void irq_set_default_host(struct irq_domain *host); -extern struct irq_domain *irq_get_default_host(void); -extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, - irq_hw_number_t hwirq, int node, - const struct irq_affinity_desc *affinity); +struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token); +void irq_set_default_host(struct irq_domain *host); +struct irq_domain *irq_get_default_host(void); +int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, + irq_hw_number_t hwirq, int node, + const struct irq_affinity_desc *affinity); static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) { @@ -370,8 +370,8 @@ static inline bool is_fwnode_irqchip(const struct fwnode_handle *fwnode) return fwnode && fwnode->ops == &irqchip_fwnode_ops; } -extern void irq_domain_update_bus_token(struct irq_domain *domain, - enum irq_domain_bus_token bus_token); +void irq_domain_update_bus_token(struct irq_domain *domain, + enum irq_domain_bus_token bus_token); static inline struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, @@ -454,7 +454,7 @@ static inline 
struct irq_domain *irq_domain_add_nomap(struct device_node *of_nod return IS_ERR(d) ? NULL : d; } -extern unsigned int irq_create_direct_mapping(struct irq_domain *host); +unsigned int irq_create_direct_mapping(struct irq_domain *host); #endif static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, @@ -507,19 +507,19 @@ static inline struct irq_domain *irq_domain_create_tree(struct fwnode_handle *fw return IS_ERR(d) ? NULL : d; } -extern void irq_domain_remove(struct irq_domain *host); +void irq_domain_remove(struct irq_domain *host); -extern int irq_domain_associate(struct irq_domain *domain, unsigned int irq, - irq_hw_number_t hwirq); -extern void irq_domain_associate_many(struct irq_domain *domain, - unsigned int irq_base, - irq_hw_number_t hwirq_base, int count); +int irq_domain_associate(struct irq_domain *domain, unsigned int irq, + irq_hw_number_t hwirq); +void irq_domain_associate_many(struct irq_domain *domain, + unsigned int irq_base, + irq_hw_number_t hwirq_base, int count); -extern unsigned int irq_create_mapping_affinity(struct irq_domain *host, - irq_hw_number_t hwirq, - const struct irq_affinity_desc *affinity); -extern unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); -extern void irq_dispose_mapping(unsigned int virq); +unsigned int irq_create_mapping_affinity(struct irq_domain *host, + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity); +unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); +void irq_dispose_mapping(unsigned int virq); static inline unsigned int irq_create_mapping(struct irq_domain *host, irq_hw_number_t hwirq) @@ -527,9 +527,9 @@ static inline unsigned int irq_create_mapping(struct irq_domain *host, return irq_create_mapping_affinity(host, hwirq, NULL); } -extern struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, - irq_hw_number_t hwirq, - unsigned int *irq); +struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, + irq_hw_number_t hwirq, + unsigned int *irq); static inline struct irq_desc *irq_resolve_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) @@ -587,19 +587,21 @@ int irq_reserve_ipi(struct irq_domain *domain, const struct cpumask *dest); int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest); /* V2 interfaces to support hierarchy IRQ domains. 
*/ -extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, - unsigned int virq); -extern void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, - const struct irq_chip *chip, - void *chip_data, irq_flow_handler_t handler, - void *handler_data, const char *handler_name); -extern void irq_domain_reset_irq_data(struct irq_data *irq_data); +struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, + unsigned int virq); +void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq, + const struct irq_chip *chip, + void *chip_data, irq_flow_handler_t handler, + void *handler_data, const char *handler_name); +void irq_domain_reset_irq_data(struct irq_data *irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY -extern struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, - unsigned int flags, unsigned int size, - struct fwnode_handle *fwnode, - const struct irq_domain_ops *ops, void *host_data); +struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, + unsigned int flags, + unsigned int size, + struct fwnode_handle *fwnode, + const struct irq_domain_ops *ops, + void *host_data); static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, unsigned int flags, @@ -613,13 +615,13 @@ static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *par ops, host_data); } -extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, - unsigned int nr_irqs, int node, void *arg, - bool realloc, - const struct irq_affinity_desc *affinity); -extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); -extern int irq_domain_activate_irq(struct irq_data *irq_data, bool early); -extern void irq_domain_deactivate_irq(struct irq_data *irq_data); +int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, + unsigned int nr_irqs, int node, void *arg, + bool realloc, + const struct irq_affinity_desc *affinity); +void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); +int irq_domain_activate_irq(struct irq_data *irq_data, bool early); +void irq_domain_deactivate_irq(struct irq_data *irq_data); static inline int irq_domain_alloc_irqs(struct irq_domain *domain, unsigned int nr_irqs, int node, void *arg) @@ -628,32 +630,32 @@ static inline int irq_domain_alloc_irqs(struct irq_domain *domain, NULL); } -extern int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg); -extern int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, - unsigned int virq, - irq_hw_number_t hwirq, - const struct irq_chip *chip, - void *chip_data); -extern void irq_domain_free_irqs_common(struct irq_domain *domain, - unsigned int virq, - unsigned int nr_irqs); -extern void irq_domain_free_irqs_top(struct irq_domain *domain, - unsigned int virq, unsigned int nr_irqs); - -extern int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg); -extern int irq_domain_pop_irq(struct irq_domain *domain, int virq); - -extern int irq_domain_alloc_irqs_parent(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg); - -extern void irq_domain_free_irqs_parent(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs); - -extern int irq_domain_disconnect_hierarchy(struct irq_domain *domain, +int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs, void 
*arg);
+int irq_domain_set_hwirq_and_chip(struct irq_domain *domain,
+ unsigned int virq,
+ irq_hw_number_t hwirq,
+ const struct irq_chip *chip,
+ void *chip_data);
+void irq_domain_free_irqs_common(struct irq_domain *domain,
+ unsigned int virq,
+ unsigned int nr_irqs);
+void irq_domain_free_irqs_top(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs);
+
+int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg);
+int irq_domain_pop_irq(struct irq_domain *domain, int virq);
+
+int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
+ unsigned int irq_base,
+ unsigned int nr_irqs, void *arg);
+
+void irq_domain_free_irqs_parent(struct irq_domain *domain,
+ unsigned int irq_base,
+ unsigned int nr_irqs);
+
+int irq_domain_disconnect_hierarchy(struct irq_domain *domain,
 unsigned int virq);
 static inline bool irq_domain_is_hierarchy(struct irq_domain *domain)
-- cgit v1.2.3

From 827bafd527dde5a6e81421e88fb2144adac1f36c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 9 Mar 2025 19:38:26 +0100
Subject: genirq: Make a few functions static

None of these functions are used outside of their source files.

Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/all/878qpe2gnx.ffs@tglx
---
 include/linux/irqdomain.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 5c0ec335d980..33ff41eef8f7 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -630,9 +630,6 @@ static inline int irq_domain_alloc_irqs(struct irq_domain *domain,
 NULL);
 }
-int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
- unsigned int irq_base,
- unsigned int nr_irqs, void *arg);
 int irq_domain_set_hwirq_and_chip(struct irq_domain *domain,
 unsigned int virq,
 irq_hw_number_t hwirq,
 const struct irq_chip *chip,
 void *chip_data);
-- cgit v1.2.3

From 26f060c106f630e34876c096c1c8997a9e68c371 Mon Sep 17 00:00:00 2001
From: Yeoreum Yun
Date: Thu, 6 Mar 2025 12:11:02 +0000
Subject: coresight: change coresight_device lock type to raw_spinlock_t

coresight_device->cscfg_csdev_lock can be held during __schedule() by
perf_event_task_sched_out()/in(). Since coresight->cscfg_csdev_lock is a
spinlock_t, and perf_event_task_sched_out()/in() is called after acquiring
rq_lock, which is a raw_spinlock_t (an unsleepable lock), this poses an
issue on PREEMPT_RT kernels, where spinlock_t is sleepable.

To address this, change the type of coresight_device->cscfg_csdev_lock
from spinlock_t to raw_spinlock_t.

Reviewed-by: James Clark
Reviewed-by: Mike Leach
Signed-off-by: Yeoreum Yun
Signed-off-by: Suzuki K Poulose
Link: https://lore.kernel.org/r/20250306121110.1647948-2-yeoreum.yun@arm.com
---
 include/linux/coresight.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 89b781e70fe7..4541bfc1cc6b 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -302,7 +302,7 @@ struct coresight_device {
 /* system configuration and feature lists */
 struct list_head feature_csdev_list;
 struct list_head config_csdev_list;
- spinlock_t cscfg_csdev_lock;
+ raw_spinlock_t cscfg_csdev_lock;
 void *active_cscfg_ctxt;
 };
-- cgit v1.2.3

From 4cf364ca57d851e192ce02e98d314d22fa514895 Mon Sep 17 00:00:00 2001
From: Yeoreum Yun
Date: Thu, 6 Mar 2025 12:11:04 +0000
Subject: coresight: change coresight_trace_id_map's lock type to raw_spinlock_t

coresight_trace_id_map->lock can be acquired while holding a coresight
device's drvdata_lock.
But the drvdata_lock can be a raw_spinlock_t (e.g. in coresight-etm4x).

To address this, change the type of coresight_trace_id_map->lock to
raw_spinlock_t.

Signed-off-by: Yeoreum Yun
Reviewed-by: James Clark
Reviewed-by: Mike Leach
Signed-off-by: Suzuki K Poulose
Link: https://lore.kernel.org/r/20250306121110.1647948-4-yeoreum.yun@arm.com
---
 include/linux/coresight.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 4541bfc1cc6b..d79a242b271d 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -239,7 +239,7 @@ struct coresight_trace_id_map {
 DECLARE_BITMAP(used_ids, CORESIGHT_TRACE_IDS_MAX);
 atomic_t __percpu *cpu_map;
 atomic_t perf_cs_etm_session_active;
- spinlock_t lock;
+ raw_spinlock_t lock;
 };
 /**
-- cgit v1.2.3

From febaa65c94e0db795ec52c45bf7ede524cdce63c Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Wed, 8 Jan 2025 10:04:35 +0100
Subject: module: Use RCU in find_module_all().

The modules list and module::kallsyms can be accessed under RCU
protection. Remove module_assert_mutex_or_preempt() from find_module_all()
so it can be used under RCU protection without warnings. Update its
callers to use RCU protection instead of preempt_disable().

Cc: Jiri Kosina
Cc: Joe Lawrence
Cc: Josh Poimboeuf
Cc: Masami Hiramatsu
Cc: Mathieu Desnoyers
Cc: Miroslav Benes
Cc: Petr Mladek
Cc: Steven Rostedt
Cc: linux-trace-kernel@vger.kernel.org
Cc: live-patching@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior
Acked-by: Peter Zijlstra (Intel)
Reviewed-by: Petr Mladek
Link: https://lore.kernel.org/r/20250108090457.512198-7-bigeasy@linutronix.de
Signed-off-by: Petr Pavlu
---
 include/linux/module.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 30e5b19bafa9..0516f5ea9153 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -666,7 +666,7 @@ static inline bool within_module(unsigned long addr, const struct module *mod)
 return within_module_init(addr, mod) || within_module_core(addr, mod);
 }
-/* Search for module by name: must be in a RCU-sched critical section. */
+/* Search for module by name: must be in a RCU critical section. */
 struct module *find_module(const char *name);
 extern void __noreturn __module_put_and_kthread_exit(struct module *mod,
-- cgit v1.2.3

From 6593a2c990f251bce63ab8de0397405303e32381 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Wed, 8 Jan 2025 10:04:43 +0100
Subject: module: Use RCU in all users of __module_address().

__module_address() can be invoked within an RCU section; there is no
requirement to have preemption disabled. Replace the preempt_disable()
section around __module_address() with RCU.
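The conversion pattern, open-coded for clarity (equivalent to the
guard(rcu)() scope in the diff below):

	rcu_read_lock();			/* was: preempt_disable() */
	mod = __module_address((unsigned long)ptr);
	if (mod)
		ptr = dereference_module_function_descriptor(mod, ptr);
	rcu_read_unlock();			/* was: preempt_enable() */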
Signed-off-by: Sebastian Andrzej Siewior Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250108090457.512198-15-bigeasy@linutronix.de Signed-off-by: Petr Pavlu --- include/linux/kallsyms.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 1c6a6c1704d8..d5dd54c53ace 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -55,12 +55,11 @@ static inline void *dereference_symbol_descriptor(void *ptr) if (is_ksym_addr((unsigned long)ptr)) return ptr; - preempt_disable(); + guard(rcu)(); mod = __module_address((unsigned long)ptr); if (mod) ptr = dereference_module_function_descriptor(mod, ptr); - preempt_enable(); #endif return ptr; } -- cgit v1.2.3 From dfd500d89545a1c23554e9d9d62d112c44b4c82e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 4 Feb 2025 13:33:36 +1030 Subject: fs: nfs: acl: Avoid -Wflex-array-member-not-at-end warning -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. So, in order to avoid ending up with a flexible-array member in the middle of other structs, we use the `struct_group_tagged()` helper to create a new tagged `struct posix_acl_hdr`. This structure groups together all the members of the flexible `struct posix_acl` except the flexible array. As a result, the array is effectively separated from the rest of the members without modifying the memory layout of the flexible structure. We then change the type of the middle struct member currently causing trouble from `struct posix_acl` to `struct posix_acl_hdr`. We also want to ensure that when new members need to be added to the flexible structure, they are always included within the newly created tagged struct. For this, we use `static_assert()`. This ensures that the memory layout for both the flexible structure and the new tagged struct is the same after any changes. This approach avoids having to implement `struct posix_acl_hdr` as a completely separate structure, thus preventing having to maintain two independent but basically identical structures, closing the door to potential bugs in the future. We also use `container_of()` whenever we need to retrieve a pointer to the flexible structure, through which we can access the flexible-array member, if necessary. So, with these changes, fix the following warning: fs/nfs_common/nfsacl.c:45:26: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: Gustavo A. R. Silva Acked-by: Anna Schumaker Acked-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/posix_acl.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index e2d47eb1a7f3..62d497763e25 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -27,11 +27,16 @@ struct posix_acl_entry { }; struct posix_acl { - refcount_t a_refcount; - unsigned int a_count; - struct rcu_head a_rcu; + /* New members MUST be added within the struct_group() macro below. 
*/ struct_group_tagged(posix_acl_hdr, hdr,
 refcount_t a_refcount;
 unsigned int a_count;
 struct rcu_head a_rcu;
 );
 struct posix_acl_entry a_entries[] __counted_by(a_count);
 };
+static_assert(offsetof(struct posix_acl, a_entries) == sizeof(struct posix_acl_hdr),
+ "struct member likely outside of struct_group_tagged()");
 #define FOREACH_ACL_ENTRY(pa, acl, pe) \
 for(pa=(acl)->a_entries, pe=pa+(acl)->a_count; pa Date: Mon, 10 Feb 2025 11:25:53 -0500
Subject: nfsd: disallow file locking and delegations for NFSv4 reexport

We do not and cannot support file locking with NFS reexport over NFSv4.x
for the same reason we don't do it for NFSv3: NFS reexport server reboot
cannot allow clients to recover locks because the source NFS server has
not rebooted, and so it is not in grace. Since the source NFS server is
not in grace, it cannot offer any guarantees that the file won't have been
changed between the locks getting lost and any attempt to recover/reclaim
them. The same applies to delegations and any associated locks, so
disallow them too.

Clients are no longer allowed to get file locks or delegations from a
reexport server; any attempts will fail with operation not supported.

Update the "Reboot recovery" section accordingly in
Documentation/filesystems/nfs/reexport.rst

Signed-off-by: Mike Snitzer
Reviewed-by: Jeff Layton
Signed-off-by: Chuck Lever
---
 include/linux/exportfs.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index a087606ace19..fc93f0abf513 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -279,10 +279,22 @@ struct export_operations {
 atomic attribute updates
 */
 #define EXPORT_OP_FLUSH_ON_CLOSE (0x20) /* fs flushes file data on close */
-#define EXPORT_OP_ASYNC_LOCK (0x40) /* fs can do async lock request */
+#define EXPORT_OP_NOLOCKS (0x40) /* no file locking support */
 unsigned long flags;
 };
+/**
+ * exportfs_cannot_lock() - check whether the export lacks file locking support
+ * @export_ops: the nfs export operations to check
+ *
+ * Returns true if the export does not support file locking.
+ */
+static inline bool
+exportfs_cannot_lock(const struct export_operations *export_ops)
+{
+ return export_ops->flags & EXPORT_OP_NOLOCKS;
+}
+
 extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
 int *max_len, struct inode *parent,
 int flags);
-- cgit v1.2.3

From 1bf70d08cc3b55abd1763e6dff5855cb8dd8318b Mon Sep 17 00:00:00 2001
From: Nilay Shroff
Date: Tue, 4 Mar 2025 15:52:33 +0530
Subject: block: introduce a dedicated lock for protecting queue elevator updates

A queue's elevator can be updated either when modifying nr_hw_queues or
through the sysfs scheduler attribute. Currently, elevator
switching/updating is protected using q->sysfs_lock, but this has led to
lockdep splats[1] due to inconsistent lock ordering between q->sysfs_lock
and the freeze-lock in multiple block layer call sites. As the scope of
q->sysfs_lock is not well-defined, its (mis)use has resulted in numerous
lockdep warnings.

To address this, introduce a new q->elevator_lock, dedicated specifically
for protecting elevator switches/updates. We now use this new
q->elevator_lock instead of q->sysfs_lock for protecting elevator
switches/updates.

While at it, make elv_iosched_load_module() a static function, as it is
only called from elv_iosched_store(). Also, remove redundant parameters
from the elv_iosched_load_module() function signature.
[1] https://lore.kernel.org/all/67637e70.050a0220.3157ee.000c.GAE@google.com/ Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-5-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 248416ecd01c..31b1b635c710 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -560,6 +560,14 @@ struct request_queue { struct blk_flush_queue *fq; struct list_head flush_list; + /* + * Protects against I/O scheduler switching, specifically when + * updating q->elevator. To ensure proper locking order during + * an elevator update, first freeze the queue, then acquire + * ->elevator_lock. + */ + struct mutex elevator_lock; + struct mutex sysfs_lock; struct mutex limits_lock; -- cgit v1.2.3 From 3efe7571c3ae2b6481253a2616c2bb3fbadd503b Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:34 +0530 Subject: block: protect nr_requests update using q->elevator_lock The sysfs attribute nr_requests could be simultaneously updated from the elevator switch/update or nr_hw_queue update code paths. The update to nr_requests for each of those code paths runs holding q->elevator_lock. So we should protect access to the sysfs attribute nr_requests using q->elevator_lock instead of q->sysfs_lock. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-6-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 31b1b635c710..3e66ad016a23 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -561,10 +561,12 @@ struct request_queue { struct list_head flush_list; /* - * Protects against I/O scheduler switching, specifically when - * updating q->elevator. To ensure proper locking order during - * an elevator update, first freeze the queue, then acquire - * ->elevator_lock. + * Protects against I/O scheduler switching, particularly when + * updating q->elevator. Since the elevator update code path may + * also modify q->nr_requests, this lock also protects the sysfs + * attribute nr_requests. + * To ensure proper locking order during an elevator update, first + * freeze the queue, then acquire ->elevator_lock. */ struct mutex elevator_lock; -- cgit v1.2.3 From 245618f8e45ff4f79327627b474b563da71c2c75 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:35 +0530 Subject: block: protect wbt_lat_usec using q->elevator_lock The wbt latency and state could be updated while initializing the elevator or exiting the elevator. It could also be updated while configuring IO latency QoS parameters using cgroup. The elevator code path is now protected with q->elevator_lock. So we should protect access to the sysfs attribute wbt_lat_usec using q->elevator_lock instead of q->sysfs_lock. While we're at it, also protect ioc_qos_write(), which configures wbt parameters via cgroup, using q->elevator_lock.
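To make the documented lock ordering concrete, here is a minimal sketch, not taken from the patch itself, of how an update under the new lock is expected to look. blk_mq_freeze_queue()/blk_mq_unfreeze_queue() are the existing block-layer freeze helpers; the surrounding function is hypothetical.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

static void example_update_under_elevator_lock(struct request_queue *q)
{
	blk_mq_freeze_queue(q);			/* step 1: freeze the queue */
	mutex_lock(&q->elevator_lock);		/* step 2: then take the lock */

	/* ... update q->elevator, q->nr_requests or wbt latency here ... */

	mutex_unlock(&q->elevator_lock);
	blk_mq_unfreeze_queue(q);
}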
Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-7-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3e66ad016a23..0ee3b5c9388e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -563,8 +563,8 @@ struct request_queue { /* * Protects against I/O scheduler switching, particularly when * updating q->elevator. Since the elevator update code path may - * also modify q->nr_requests, this lock also protects the sysfs - * attribute nr_requests. + * also modify q->nr_requests and wbt latency, this lock also + * protects the sysfs attributes nr_requests and wbt_lat_usec. * To ensure proper locking order during an elevator update, first * freeze the queue, then acquire ->elevator_lock. */ -- cgit v1.2.3 From 5e40f4452dc9a3fb44d13bb6bc7032f3911a2675 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 4 Mar 2025 15:52:36 +0530 Subject: block: protect read_ahead_kb using q->limits_lock The bdi->ra_pages could be updated under q->limits_lock because it's usually calculated from the queue limits by queue_limits_commit_update. So protect reading/writing the sysfs attribute read_ahead_kb using q->limits_lock instead of q->sysfs_lock. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250304102551.2533767-8-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0ee3b5c9388e..3bee1b4858b6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -571,6 +571,9 @@ struct request_queue { struct mutex elevator_lock; struct mutex sysfs_lock; + /* + * Protects queue limits and also sysfs attribute read_ahead_kb. + */ struct mutex limits_lock; /* -- cgit v1.2.3 From 5abba4cebec0a591ca7e7f55701e42cd5dc059af Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 6 Mar 2025 15:09:53 +0530 Subject: block: protect hctx attributes/params using q->elevator_lock Currently, hctx attributes (nr_tags, nr_reserved_tags, and cpu_list) are protected using `q->sysfs_lock`. However, these attributes can be updated in multiple scenarios: - During the driver's probe method. - When updating nr_hw_queues. - When writing to the sysfs attribute nr_requests, which can modify nr_tags. The nr_requests attribute is already protected using q->elevator_lock, but none of the update paths actually use q->sysfs_lock to protect hctx attributes. So to ensure proper synchronization, replace q->sysfs_lock with q->elevator_lock when reading hctx attributes through sysfs. Additionally, blk_mq_update_nr_hw_queues allocates and updates hctx. The allocation of hctx is protected using q->elevator_lock, however, updating hctx params happens without any protection, so safeguard hctx param update path by also using q->elevator_lock. 
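For the reader side, a hedged sketch of what swapping the lock in a hctx sysfs show() helper looks like; the function name is illustrative, not the real blk-mq-sysfs helper.

static ssize_t example_hctx_nr_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
{
	struct request_queue *q = hctx->queue;
	ssize_t ret;

	mutex_lock(&q->elevator_lock);	/* was: mutex_lock(&q->sysfs_lock) */
	ret = sysfs_emit(page, "%u\n", hctx->tags->nr_tags);
	mutex_unlock(&q->elevator_lock);

	return ret;
}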
Signed-off-by: Nilay Shroff Link: https://lore.kernel.org/r/20250306093956.2818808-1-nilay@linux.ibm.com [axboe: wrap comment at 80 chars] Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3bee1b4858b6..dcf8fce15e23 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -561,12 +561,14 @@ struct request_queue { struct list_head flush_list; /* - * Protects against I/O scheduler switching, particularly when - * updating q->elevator. Since the elevator update code path may - * also modify q->nr_requests and wbt latency, this lock also - * protects the sysfs attributes nr_requests and wbt_lat_usec. - * To ensure proper locking order during an elevator update, first - * freeze the queue, then acquire ->elevator_lock. + * Protects against I/O scheduler switching, particularly when updating + * q->elevator. Since the elevator update code path may also modify q-> + * nr_requests and wbt latency, this lock also protects the sysfs attrs + * nr_requests and wbt_lat_usec. Additionally the nr_hw_queues update + * may modify hctx tags, reserved-tags and cpumask, so this lock also + * helps protect the hctx attrs. To ensure proper locking order during + * an elevator or nr_hw_queue update, first freeze the queue, then + * acquire ->elevator_lock. */ struct mutex elevator_lock; -- cgit v1.2.3 From f3e5fe4adfb837cf68da2a25c5c8c1ef2d0c4d78 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Mar 2025 14:40:52 -0800 Subject: lib/crc7: unexport crc7_be_syndrome_table Since neither crc7_be_syndrome_table nor crc7_be_byte() are used outside lib/crc7.c, fold them into lib/crc7.c. Link: https://lore.kernel.org/r/20250304224052.157915-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc7.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc7.h b/include/linux/crc7.h index b462842f3c32..61d34749e437 100644 --- a/include/linux/crc7.h +++ b/include/linux/crc7.h @@ -3,13 +3,6 @@ #define _LINUX_CRC7_H #include -extern const u8 crc7_be_syndrome_table[256]; - -static inline u8 crc7_be_byte(u8 crc, u8 data) -{ - return crc7_be_syndrome_table[crc ^ data]; -} - extern u8 crc7_be(u8 crc, const u8 *buffer, size_t len); #endif -- cgit v1.2.3 From 536da2a440b58d4f9aed963364a2921a41bec03d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 10 Feb 2025 13:34:45 +0000 Subject: gfs2: Convert gfs2_end_log_write_bh() to work on a folio gfs2_end_log_write() has to handle bios which consist of both pages which belong to folios and pages which were allocated from a mempool and do not belong to a folio. It would be cleaner to have separate endio handlers which handle each type, but it's not clear to me whether that's even possible. This patch is slightly forward-looking in that page_folio() cannot currently return NULL, but it will return NULL in the future for pages which do not belong to a folio. This was the last user of page_has_buffers(), so remove it. 
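A sketch of the forward-looking shape the commit describes, assuming a future page_folio() that returns NULL for folio-less mempool pages; the function is illustrative, not the real gfs2_end_log_write_bh().

#include <linux/buffer_head.h>

static void example_end_log_page(struct page *page)
{
	struct folio *folio = page_folio(page);

	/* Mempool-backed page: no folio, so no buffer_heads to complete. */
	if (!folio)
		return;

	/* Folio-backed page: complete any buffer_heads attached to it. */
	if (folio_buffers(folio)) {
		/* ... walk and end the buffer_heads ... */
	}
}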
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andreas Gruenbacher --- include/linux/buffer_head.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 932139c5d46f..fab70b26e131 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -182,7 +182,6 @@ static inline unsigned long bh_offset(const struct buffer_head *bh) BUG_ON(!PagePrivate(page)); \ ((struct buffer_head *)page_private(page)); \ }) -#define page_has_buffers(page) PagePrivate(page) #define folio_buffers(folio) folio_get_private(folio) void buffer_check_dirty_writeback(struct folio *folio, -- cgit v1.2.3 From a7eb9124d92be1330afd18c98e4f28f297b50cb8 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 14 Feb 2025 18:03:01 -0600 Subject: PCI: Cache offset of Resizable BAR capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously most resizable BAR interfaces (pci_rebar_get_possible_sizes(), pci_rebar_set_size(), etc.) as well as pci_restore_state() searched config space for a Resizable BAR capability. Most devices don't have such a capability, so this is wasted effort, especially for pci_restore_state(). Search for a Resizable BAR capability once at enumeration-time and cache the offset so we don't have to search every time we need it. No functional change intended. Link: https://lore.kernel.org/r/20250215000301.175097-3-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Reviewed-by: Ilpo Järvinen --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 47b31ad724fa..9e5bbd996c83 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -353,6 +353,7 @@ struct pci_dev { struct pci_dev *rcec; /* Associated RCEC device */ #endif u32 devcap; /* PCIe Device Capabilities */ + u16 rebar_cap; /* Resizable BAR capability offset */ u8 pcie_cap; /* PCIe capability offset */ u8 msi_cap; /* MSI capability offset */ u8 msix_cap; /* MSI-X capability offset */ -- cgit v1.2.3 From 366fef794bd2b7c2e9df933f6828dd9739bfba84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 10 Mar 2025 14:21:58 +0200 Subject: <linux/cleanup.h>: Allow the passing of both iomem and non-iomem pointers to no_free_ptr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling no_free_ptr() for an __iomem pointer results in Sparse complaining about the types: warning: incorrect type in argument 1 (different address spaces) expected void const volatile *val got void [noderef] __iomem *__val [ The example is from drivers/platform/x86/intel/pmc/core_ssram.c:283 ] The problem is caused by the signature of __must_check_fn() added in: 85be6d842447 ("cleanup: Make no_free_ptr() __must_check") ... to enforce that the return value is always used. Use __force to allow both iomem and non-iomem pointers to be given for no_free_ptr(). Reported-by: kernel test robot Signed-off-by: Ilpo Järvinen Signed-off-by: Ingo Molnar Reviewed-by: Andy Shevchenko Reviewed-by: Dan Williams Cc: "H.
Peter Anvin" Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250310122158.20966-1-ilpo.jarvinen@linux.intel.com Closes: https://lore.kernel.org/oe-kbuild-all/202403050547.qnZtuNlN-lkp@intel.com/ --- include/linux/cleanup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index ec00e3f7af2b..ee2614adb785 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -212,7 +212,7 @@ const volatile void * __must_check_fn(const volatile void *val) { return val; } #define no_free_ptr(p) \ - ((typeof(p)) __must_check_fn(__get_and_null(p, NULL))) + ((typeof(p)) __must_check_fn((__force const volatile void *)__get_and_null(p, NULL))) #define return_ptr(p) return no_free_ptr(p) -- cgit v1.2.3 From 9da4f4f9877ecb006e73f38e92e22d10e989b00d Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Mon, 10 Mar 2025 10:24:10 -0400 Subject: lsm: remove old email address for Stephen Smalley Remove my old, no longer functioning, email address from comments. Could alternatively replace with my current email but seems redundant with MAINTAINERS and prone to being out of date. Signed-off-by: Stephen Smalley [PM: subject tweak] Signed-off-by: Paul Moore --- include/linux/lsm_audit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index e13d2f947b51..7283bc4cf413 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -5,7 +5,7 @@ * * Author : Etienne BASSET * - * All credits to : Stephen Smalley, + * All credits to : Stephen Smalley * All BUGS to : Etienne BASSET */ #ifndef _LSM_COMMON_LOGGING_ -- cgit v1.2.3 From a18dfa9925b9ef6107ea3aa5814ca3c704d34a8a Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 6 Mar 2025 22:34:09 -0500 Subject: ipv6: save dontfrag in cork When spanning datagram construction over multiple send calls using MSG_MORE, per datagram settings are configured on the first send. That is when ip(6)_setup_cork stores these settings for subsequent use in __ip(6)_append_data and others. The only flag that escaped this was dontfrag. As a result, a datagram could be constructed with df=0 on the first sendmsg, but df=1 on a next. Which is what cmsg_ip.sh does in an upcoming MSG_MORE test in the "diff" scenario. Changing datagram conditions in the middle of constructing an skb makes this already complex code path even more convoluted. It is here unintentional. Bring this flag in line with expected sockopt/cmsg behavior. And stop passing ipc6 to __ip6_append_data, to avoid such issues in the future. This is already the case for __ip_append_data. inet6_cork had a 6 byte hole, so the 1B flag has no impact. 
Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250307033620.411611-3-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index a6e2aadbb91b..5aeeed22f35b 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -207,6 +207,7 @@ struct inet6_cork { struct ipv6_txoptions *opt; u8 hop_limit; u8 tclass; + u8 dontfrag:1; }; /* struct ipv6_pinfo - ipv6 private area */ -- cgit v1.2.3 From b9014a10bdb8e967d85819f5ef61e6f80d0b46c9 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 10 Feb 2025 17:29:11 -0600 Subject: dmaengine: Remove device_prep_dma_imm_data from struct dma_device The device_prep_dma_imm_data() method isn't implemented or invoked by any code since commit 80ade22c06ca ("misc: mic: remove the MIC drivers"). Remove it, shrinking struct dma_device by a few bytes. Signed-off-by: Nathan Lynch Link: https://lore.kernel.org/r/20250210-dmaengine-drop-imm-data-v1-1-e017766da2fa@amd.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index a360e0330436..bb146c5ac3e4 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -839,7 +839,6 @@ struct dma_filter { * The function takes a buffer of size buf_len. The callback function will * be called after period_len bytes have been transferred. * @device_prep_interleaved_dma: Transfer expression in a generic way. - * @device_prep_dma_imm_data: DMA's 8 byte immediate data to the dst address * @device_caps: May be used to override the generic DMA slave capabilities * with per-channel specific ones * @device_config: Pushes a new configuration to a channel, return 0 or an error @@ -942,9 +941,6 @@ struct dma_device { struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)( struct dma_chan *chan, struct dma_interleaved_template *xt, unsigned long flags); - struct dma_async_tx_descriptor *(*device_prep_dma_imm_data)( - struct dma_chan *chan, dma_addr_t dst, u64 data, - unsigned long flags); void (*device_caps)(struct dma_chan *chan, struct dma_slave_caps *caps); int (*device_config)(struct dma_chan *chan, struct dma_slave_config *config); -- cgit v1.2.3 From e9cec4487cb789645a8c84b13a9ce54c2d89e3bb Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Wed, 26 Feb 2025 16:59:01 -0300 Subject: printk: Rename suspend_console to console_suspend_all The function suspend_console has a misleading name, since it suspends all consoles, so rename it accordingly. Signed-off-by: Marcos Paulo de Souza Reviewed-by: Petr Mladek Reviewed-by: John Ogness Link: https://lore.kernel.org/r/20250226-printk-renaming-v1-1-0b878577f2e6@suse.com [pmladek@suse.com: Fixed typo in the commit message.] 
Signed-off-by: Petr Mladek --- include/linux/console.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index eba367bf605d..fa5941f4f9c5 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -648,7 +648,7 @@ static inline void console_sysfs_notify(void) extern bool console_suspend_enabled; /* Suspend and resume console messages over PM events */ -extern void suspend_console(void); +extern void console_suspend_all(void); extern void resume_console(void); int mda_console_init(void); -- cgit v1.2.3 From 63830aef74188354806ea3c9043dd3929c6e47f3 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Wed, 26 Feb 2025 16:59:02 -0300 Subject: printk: Rename resume_console to console_resume_all The function resume_console has a misleading name, since it resumes all consoles, so rename it accordingly. Signed-off-by: Marcos Paulo de Souza Reviewed-by: Petr Mladek Reviewed-by: John Ogness Link: https://lore.kernel.org/r/20250226-printk-renaming-v1-2-0b878577f2e6@suse.com [pmladek@suse.com: Fixed typo in the commit message.] Signed-off-by: Petr Mladek --- include/linux/console.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index fa5941f4f9c5..0d48e0deb213 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -649,7 +649,7 @@ extern bool console_suspend_enabled; /* Suspend and resume console messages over PM events */ extern void console_suspend_all(void); -extern void resume_console(void); +extern void console_resume_all(void); int mda_console_init(void); -- cgit v1.2.3 From 242fafe3faa761ccc27dc2ebb978ca1ec04adc25 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Wed, 26 Feb 2025 16:59:03 -0300 Subject: printk: Rename console_stop to console_suspend The intent of console_stop was in fact to suspend it, so rename the function accordingly. Signed-off-by: Marcos Paulo de Souza Reviewed-by: Petr Mladek Reviewed-by: John Ogness Link: https://lore.kernel.org/r/20250226-printk-renaming-v1-3-0b878577f2e6@suse.com [pmladek@suse.com: Fixed typo in the commit message. Updated also new drm_log.c] Signed-off-by: Petr Mladek --- include/linux/console.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 0d48e0deb213..74587eeea3c7 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -633,7 +633,7 @@ extern void console_conditional_schedule(void); extern void console_unblank(void); extern void console_flush_on_panic(enum con_flush_mode mode); extern struct tty_driver *console_device(int *); -extern void console_stop(struct console *); +extern void console_suspend(struct console *); extern void console_start(struct console *); extern int is_console_locked(void); extern int braille_register_console(struct console *, int index, -- cgit v1.2.3 From 5395e09c803e20ea0713eaa3a44bc8dd36a009b7 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Wed, 26 Feb 2025 16:59:04 -0300 Subject: printk: Rename console_start to console_resume The intent of console_start was to resume a previously suspended console, so rename it accordingly. Signed-off-by: Marcos Paulo de Souza Reviewed-by: Petr Mladek Reviewed-by: John Ogness Link: https://lore.kernel.org/r/20250226-printk-renaming-v1-4-0b878577f2e6@suse.com [pmladek@suse.com: Fixed typo in the commit message. 
Updated also new drm_log.c.] Signed-off-by: Petr Mladek --- include/linux/console.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 74587eeea3c7..8f10d0a85bb4 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -634,7 +634,7 @@ extern void console_unblank(void); extern void console_flush_on_panic(enum con_flush_mode mode); extern struct tty_driver *console_device(int *); extern void console_suspend(struct console *); -extern void console_start(struct console *); +extern void console_resume(struct console *); extern int is_console_locked(void); extern int braille_register_console(struct console *, int index, char *console_options, char *braille_options); -- cgit v1.2.3 From 29c6e1c2b923b43e8082bba5c6675185a8fe305a Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 27 Feb 2025 14:47:47 +0000 Subject: iommu: Unexport iommu_fwspec_free() The drivers doing their own fwspec parsing have no need to call iommu_fwspec_free() since fwspecs were moved into dev_iommu, as returning an error from .probe_device will tear down the whole lot anyway. Move it into the private interface now that it only serves for of_iommu to clean up in an error case. I have no idea what mtk_v1 was doing in effectively guaranteeing a NULL fwspec would be dereferenced if no "iommus" DT property was found, so add a check for that to at least make the code look sane. Signed-off-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/36e245489361de2d13db22a510fa5c79e7126278.1740667667.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e93d2e918599..cf8c16ba04a0 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1099,7 +1099,6 @@ struct iommu_mm_data { }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); -void iommu_fwspec_free(struct device *dev); int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids); static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) @@ -1410,10 +1409,6 @@ static inline int iommu_fwspec_init(struct device *dev, return -ENODEV; } -static inline void iommu_fwspec_free(struct device *dev) -{ -} - static inline int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids) { -- cgit v1.2.3 From c380931712d16e23f6aa90703f438330139e9731 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 27 Feb 2025 14:41:48 +0000 Subject: dma: Fix encryption bit clearing for dma_to_phys phys_to_dma() sets the encryption bit on the translated DMA address. But dma_to_phys() clears the encryption bit after it has been translated back to the physical address, which could fail if the device uses DMA ranges. AMD SME doesn't use the DMA ranges and thus this is harmless. But as we are about to add support for other architectures, let us fix this. 
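The fix boils down to an ordering change, sketched below to mirror the hunk in the diff that follows: the encryption bit must be stripped before the address is matched against dev->dma_range_map, otherwise the lookup sees a dirty bus address and the translation can fail.

static inline phys_addr_t example_dma_to_phys(struct device *dev, dma_addr_t dma_addr)
{
	dma_addr = __sme_clr(dma_addr);		/* clear the encryption bit first */

	if (dev->dma_range_map)
		return translate_dma_to_phys(dev, dma_addr);

	return dma_addr;
}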
Reported-by: Aneesh Kumar K.V Link: https://lkml.kernel.org/r/yq5amsen9stc.fsf@kernel.org Cc: Will Deacon Cc: Jean-Philippe Brucker Cc: Robin Murphy Cc: Steven Price Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Tom Lendacky Reviewed-by: Robin Murphy Acked-by: Tom Lendacky Signed-off-by: Suzuki K Poulose Reviewed-by: Gavin Shan Acked-by: Marek Szyprowski Fixes: 42be24a4178f ("arm64: Enable memory encrypt for Realms") Acked-by: Will Deacon Link: https://lore.kernel.org/r/20250227144150.1667735-2-suzuki.poulose@arm.com Signed-off-by: Catalin Marinas --- include/linux/dma-direct.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index d7e30d4f7503..d20ecc24cb0f 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -101,12 +101,13 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { phys_addr_t paddr; + dma_addr = __sme_clr(dma_addr); if (dev->dma_range_map) paddr = translate_dma_to_phys(dev, dma_addr); else paddr = dma_addr; - return __sme_clr(paddr); + return paddr; } #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ -- cgit v1.2.3 From b66e2ee7b6c8d45bbe4b6f6885ee27511506812c Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 27 Feb 2025 14:41:49 +0000 Subject: dma: Introduce generic dma_addr_*crypted helpers AMD SME added __sme_set/__sme_clr primitives to modify the DMA address for encrypted/decrypted traffic. However, this doesn't fit in with other models, e.g. Arm CCA, where the meanings are the opposite: "decrypted" traffic has a bit set and "encrypted" traffic has the top bit cleared. In preparation for adding support for Arm CCA DMA conversions, convert the existing primitives to more generic ones that can be provided by the backends, i.e. add helpers to: 1. dma_addr_encrypted - Convert a DMA address to "encrypted" [ == __sme_set() ] 2. dma_addr_unencrypted - Convert a DMA address to "decrypted" [ None exists today ] 3. dma_addr_canonical - Clear any "encryption"/"decryption" bits from DMA address [ SME uses __sme_clr() ] and convert to a canonical DMA address. Since the original __sme_xxx helpers come from linux/mem_encrypt.h, use that as the home for the new definitions and provide dummy ones when none is provided by the architectures. With the above, phys_to_dma_unencrypted() uses the newly added dma_addr_unencrypted() helper and, to make it a bit easier to read and avoid double conversion, provide __phys_to_dma().
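For illustration, a hedged sketch of how an architecture with inverted semantics (the commit cites Arm CCA) might provide the new hooks from its asm/mem_encrypt.h. PROT_NS_SHARED is a stand-in name for whatever attribute marks shared (unencrypted) memory, not a claim about the real arm64 definition.

#define dma_addr_unencrypted(x)	((x) | PROT_NS_SHARED)	/* shared == decrypted */
#define dma_addr_encrypted(x)	((x) & ~PROT_NS_SHARED)	/* protected == encrypted */
#define dma_addr_canonical(x)	((x) & ~PROT_NS_SHARED)	/* strip the attribute */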
Suggested-by: Robin Murphy Cc: Will Deacon Cc: Jean-Philippe Brucker Cc: Robin Murphy Cc: Steven Price Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Tom Lendacky Cc: Aneesh Kumar K.V Signed-off-by: Suzuki K Poulose Reviewed-by: Robin Murphy Reviewed-by: Gavin Shan Acked-by: Marek Szyprowski Fixes: 42be24a4178f ("arm64: Enable memory encrypt for Realms") Acked-by: Will Deacon Link: https://lore.kernel.org/r/20250227144150.1667735-3-suzuki.poulose@arm.com Signed-off-by: Catalin Marinas --- include/linux/dma-direct.h | 12 ++++++++---- include/linux/mem_encrypt.h | 23 +++++++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index d20ecc24cb0f..f3bc0bcd7098 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -78,14 +78,18 @@ static inline dma_addr_t dma_range_map_max(const struct bus_dma_region *map) #define phys_to_dma_unencrypted phys_to_dma #endif #else -static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, - phys_addr_t paddr) +static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) { if (dev->dma_range_map) return translate_phys_to_dma(dev, paddr); return paddr; } +static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, + phys_addr_t paddr) +{ + return dma_addr_unencrypted(__phys_to_dma(dev, paddr)); +} /* * If memory encryption is supported, phys_to_dma will set the memory encryption * bit in the DMA address, and dma_to_phys will clear it. @@ -94,14 +98,14 @@ static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, */ static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { - return __sme_set(phys_to_dma_unencrypted(dev, paddr)); + return dma_addr_encrypted(__phys_to_dma(dev, paddr)); } static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { phys_addr_t paddr; - dma_addr = __sme_clr(dma_addr); + dma_addr = dma_addr_canonical(dma_addr); if (dev->dma_range_map) paddr = translate_dma_to_phys(dev, dma_addr); else diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index ae4526389261..07584c5e36fb 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -26,11 +26,34 @@ */ #define __sme_set(x) ((x) | sme_me_mask) #define __sme_clr(x) ((x) & ~sme_me_mask) + +#define dma_addr_encrypted(x) __sme_set(x) +#define dma_addr_canonical(x) __sme_clr(x) + #else #define __sme_set(x) (x) #define __sme_clr(x) (x) #endif +/* + * dma_addr_encrypted() and dma_addr_unencrypted() are for converting a given DMA + * address to the respective type of addressing. + * + * dma_addr_canonical() is used to reverse any conversions for encrypted/decrypted + * back to the canonical address. + */ +#ifndef dma_addr_encrypted +#define dma_addr_encrypted(x) (x) +#endif + +#ifndef dma_addr_unencrypted +#define dma_addr_unencrypted(x) (x) +#endif + +#ifndef dma_addr_canonical +#define dma_addr_canonical(x) (x) +#endif + #endif /* __ASSEMBLY__ */ #endif /* __MEM_ENCRYPT_H__ */ -- cgit v1.2.3 From 6bc022653d514bd898f23c5f3e48773438e3d5dd Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 29 Jan 2025 21:43:35 +0000 Subject: mmc: slot-gpio: Remove unused mmc_gpio_set_cd_isr mmc_gpio_set_cd_isr() last use was removed in 2018 by commit 7838a8ddc80b ("mmc: omap_hsmmc: Kill off cover detection") Remove it. Signed-off-by: Dr. 
David Alan Gilbert Link: https://lore.kernel.org/r/20250129214335.125292-1-linux@treblig.org Signed-off-by: Ulf Hansson --- include/linux/mmc/slot-gpio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/slot-gpio.h b/include/linux/mmc/slot-gpio.h index 274a2767ea49..1ed7b0d1e4f9 100644 --- a/include/linux/mmc/slot-gpio.h +++ b/include/linux/mmc/slot-gpio.h @@ -22,7 +22,6 @@ int mmc_gpiod_request_cd(struct mmc_host *host, const char *con_id, int mmc_gpiod_request_ro(struct mmc_host *host, const char *con_id, unsigned int idx, unsigned int debounce); int mmc_gpiod_set_cd_config(struct mmc_host *host, unsigned long config); -void mmc_gpio_set_cd_isr(struct mmc_host *host, irq_handler_t isr); int mmc_gpio_set_cd_wake(struct mmc_host *host, bool on); void mmc_gpiod_request_cd_irq(struct mmc_host *host); bool mmc_can_gpio_cd(struct mmc_host *host); -- cgit v1.2.3 From 76f9409f813c9dc24a22fa7a587a31983706c4d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:27 +0100 Subject: cgroup: Update file naming comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This changed a long time ago in commit 8d7e6fb0a1db9 ("cgroup: update cgroup name handling"). Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1b20d2d8ef7c..d8d6fe24f375 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -619,9 +619,8 @@ struct cgroup_root { */ struct cftype { /* - * By convention, the name should begin with the name of the - * subsystem, followed by a period. Zero length string indicates - * end of cftype array. + * Name of the subsystem is prepended in cgroup_file_name(). + * Zero length string indicates end of cftype array. */ char name[MAX_CFTYPE_NAME]; unsigned long private; -- cgit v1.2.3 From 4a893bdc18df087ab7dde28a4e8066ae39faa700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:28 +0100 Subject: blk-cgroup: Simplify policy files registration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use one set of files when there is no difference between default and legacy files, similar to regular subsys files registration. No functional change. Signed-off-by: Michal Koutný Acked-by: Jens Axboe Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f8ef47f8a634..8e7415c64ed1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -113,6 +113,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); void cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); -- cgit v1.2.3 From 1e7dcbfa4b7cddfbe1cde6067d135f5452692256 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 5 Mar 2025 12:26:38 -0800 Subject: KVM: arm64: Remap PMUv3 events onto hardware Map PMUv3 event IDs onto hardware, if the driver exposes such a helper.
This is expected to be quite rare, and only useful for non-PMUv3 hardware. Tested-by: Janne Grunau Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250305202641.428114-12-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- include/linux/perf/arm_pmu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 4b5b83677e3f..7ce6dea5bfa9 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -100,6 +100,10 @@ struct arm_pmu { void (*stop)(struct arm_pmu *); void (*reset)(void *); int (*map_event)(struct perf_event *event); + /* + * Called by KVM to map the PMUv3 event space onto non-PMUv3 hardware. + */ + int (*map_pmuv3_event)(unsigned int eventsel); DECLARE_BITMAP(cntr_mask, ARMPMU_MAX_HWEVENTS); bool secure_access; /* 32-bit ARM only */ #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 -- cgit v1.2.3 From 3c021531131c9ee5df5da739819a515326ee9570 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:36:47 +0000 Subject: x86/resctrl: Add a helper to avoid reaching into the arch code resource list Resctrl occasionally wants to know something about a specific resource, in these cases it reaches into the arch code's rdt_resources_all[] array. Once the filesystem parts of resctrl are moved to /fs/, this means it will need visibility of the architecture specific struct rdt_hw_resource definition, and the array of all resources. All architectures would also need a r_resctrl member in this struct. Instead, abstract this via a helper to allow architectures to do different things here. Move the level enum to the resctrl header and add a helper to retrieve the struct rdt_resource by 'rid'. resctrl_arch_get_resource() should not return NULL for any value in the enum, it may instead return a dummy resource that is !alloc_enabled && !mon_enabled. Co-developed-by: Dave Martin Signed-off-by: Dave Martin Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Tony Luck Reviewed-by: Fenghua Yu Reviewed-by: Babu Moger Tested-by: Peter Newman Tested-by: Carl Worth # arm64 Tested-by: Shaopeng Tan Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-3-james.morse@arm.com --- include/linux/resctrl.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index d94abba1c716..37279e2a89da 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -37,6 +37,16 @@ enum resctrl_conf_type { CDP_DATA, }; +enum resctrl_res_level { + RDT_RESOURCE_L3, + RDT_RESOURCE_L2, + RDT_RESOURCE_MBA, + RDT_RESOURCE_SMBA, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + #define CDP_NUM_TYPES (CDP_DATA + 1) /* @@ -226,6 +236,13 @@ struct rdt_resource { bool cdp_capable; }; +/* + * Get the resource that exists at this level. If the level is not supported + * a dummy/not-capable resource can be returned. Levels >= RDT_NUM_RESOURCES + * will return NULL. 
+ */ +struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l); + /** * struct resctrl_schema - configuration abilities of a resource presented to * user-space -- cgit v1.2.3 From 131dab13a82d9edd97dd96d98d2919f56f339331 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:36:48 +0000 Subject: x86/resctrl: Remove fflags from struct rdt_resource The resctrl arch code specifies whether a resource controls a cache or memory using the fflags field. This field is then used by resctrl to determine which files should be exposed in the filesystem. Allowing the architecture to pick this value means the RFTYPE_ flags have to be in a shared header, and allows an architecture to create a combination that resctrl does not support. Remove the fflags field, and pick the value based on the resource id. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-4-james.morse@arm.com --- include/linux/resctrl.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 37279e2a89da..496ddcaa4ecf 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -210,7 +210,6 @@ enum resctrl_scope { * @format_str: Per resource format string to show domain value * @parse_ctrlval: Per resource function pointer to parse control values * @evt_list: List of monitoring events - * @fflags: flags to choose base and info files * @cdp_capable: Is the CDP feature available on this resource */ struct rdt_resource { @@ -232,7 +231,6 @@ struct rdt_resource { struct resctrl_schema *s, struct rdt_ctrl_domain *d); struct list_head evt_list; - unsigned long fflags; bool cdp_capable; }; -- cgit v1.2.3 From c24f5eab6b2689fd4e97a2d2fe963864036653e6 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:36:49 +0000 Subject: x86/resctrl: Use schema type to determine how to parse schema values Resctrl's architecture code gets to specify a function pointer that is used when parsing schema entries. This is expected to be one of two helpers from the filesystem code. Setting this function pointer allows the architecture code to change the ABI resctrl presents to user-space, and forces resctrl to expose these helpers. Instead, add a schema format enum to choose which schema parser to use. This allows the helpers to be made static and the structs used for passing arguments moved out of shared headers. 
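A sketch of the filesystem-side dispatch this enables; the enum values appear in the diff below, while parse_cbm() and parse_bw() stand in for resctrl's now-static parsing helpers.

switch (s->res->schema_fmt) {
case RESCTRL_SCHEMA_BITMAP:
	return parse_cbm(data, s, d);	/* hex bitmap, e.g. "0=00ff" */
case RESCTRL_SCHEMA_RANGE:
	return parse_bw(data, s, d);	/* decimal value, e.g. "0=80" */
}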
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Reviewed-by: Babu Moger Tested-by: Carl Worth # arm64 Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-5-james.morse@arm.com --- include/linux/resctrl.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 496ddcaa4ecf..3e1a41356b2a 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -183,7 +183,6 @@ struct resctrl_membw { u32 *mb_map; }; -struct rdt_parse_data; struct resctrl_schema; enum resctrl_scope { @@ -192,6 +191,16 @@ enum resctrl_scope { RESCTRL_L3_NODE, }; +/** + * enum resctrl_schema_fmt - The format user-space provides for a schema. + * @RESCTRL_SCHEMA_BITMAP: The schema is a bitmap in hex. + * @RESCTRL_SCHEMA_RANGE: The schema is a decimal number. + */ +enum resctrl_schema_fmt { + RESCTRL_SCHEMA_BITMAP, + RESCTRL_SCHEMA_RANGE, +}; + /** * struct rdt_resource - attributes of a resctrl resource * @rid: The index of the resource @@ -208,7 +217,7 @@ enum resctrl_scope { * @data_width: Character width of data when displaying * @default_ctrl: Specifies default cache cbm or memory B/W percent. * @format_str: Per resource format string to show domain value - * @parse_ctrlval: Per resource function pointer to parse control values + * @schema_fmt: Which format string and parser is used for this schema. * @evt_list: List of monitoring events * @cdp_capable: Is the CDP feature available on this resource */ @@ -227,9 +236,7 @@ struct rdt_resource { int data_width; u32 default_ctrl; const char *format_str; - int (*parse_ctrlval)(struct rdt_parse_data *data, - struct resctrl_schema *s, - struct rdt_ctrl_domain *d); + enum resctrl_schema_fmt schema_fmt; struct list_head evt_list; bool cdp_capable; }; -- cgit v1.2.3 From bb9343c8f2906ac7087a7f9560b37636c220551d Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:36:50 +0000 Subject: x86/resctrl: Use schema type to determine the schema format string Resctrl's architecture code gets to specify a format string that is used when printing schema entries. This is expected to be one of two values that the filesystem code supports. Setting this format string allows the architecture code to change the ABI resctrl presents to user-space. Instead, use the schema format enum to choose which format string to use. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Reinette Chatre Reviewed-by: Tony Luck Reviewed-by: Fenghua Yu Reviewed-by: Babu Moger Tested-by: Carl Worth # arm64 Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-6-james.morse@arm.com --- include/linux/resctrl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 3e1a41356b2a..736b9a9a9464 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -216,7 +216,6 @@ enum resctrl_schema_fmt { * @name: Name to use in "schemata" file. 
* @data_width: Character width of data when displaying * @default_ctrl: Specifies default cache cbm or memory B/W percent. - * @format_str: Per resource format string to show domain value * @schema_fmt: Which format string and parser is used for this schema. * @evt_list: List of monitoring events * @cdp_capable: Is the CDP feature available on this resource @@ -235,7 +234,6 @@ struct rdt_resource { char *name; int data_width; u32 default_ctrl; - const char *format_str; enum resctrl_schema_fmt schema_fmt; struct list_head evt_list; bool cdp_capable; @@ -253,6 +251,7 @@ struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l); * user-space * @list: Member of resctrl_schema_all. * @name: The name to use in the "schemata" file. + * @fmt_str: Format string to show domain value. * @conf_type: Whether this schema is specific to code/data. * @res: The resource structure exported by the architecture to describe * the hardware that is configured by this schema. @@ -263,6 +262,7 @@ struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l); struct resctrl_schema { struct list_head list; char name[8]; + const char *fmt_str; enum resctrl_conf_type conf_type; struct rdt_resource *res; u32 num_closid; -- cgit v1.2.3 From 43312b8ea1c61abbc5e3bdc87fe2e18f755d549d Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:36:51 +0000 Subject: x86/resctrl: Remove data_width and the tabular format The resctrl architecture code provides a data_width for the controls of each resource. This is used to zero pad all control values in the schemata file so they appear in columns. The same is done with the resource names to complete the visual effect. e.g. | SMBA:0=2048 | L3:0=00ff AMD platforms discover their maximum bandwidth for the MB resource from firmware, but hard-code the data_width to 4. If the maximum bandwidth requires more digits - the tabular format is silently broken. This is also broken when the mba_MBps mount option is used as the field width isn't updated. If new schema are added resctrl will need to be able to determine the maximum width. The benefit of this pretty-printing is questionable. Instead of handling runtime discovery of the data_width for AMD platforms, remove the feature. These fields are always zero padded so should be harmless to remove if the whole field has been treated as a number. In the above example, this would now look like this: | SMBA:0=2048 | L3:0=ff Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-7-james.morse@arm.com --- include/linux/resctrl.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 736b9a9a9464..e1a982adef45 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -214,7 +214,6 @@ enum resctrl_schema_fmt { * @ctrl_domains: RCU list of all control domains for this resource * @mon_domains: RCU list of all monitor domains for this resource * @name: Name to use in "schemata" file. - * @data_width: Character width of data when displaying * @default_ctrl: Specifies default cache cbm or memory B/W percent. * @schema_fmt: Which format string and parser is used for this schema. 
* @evt_list: List of monitoring events @@ -232,7 +231,6 @@ struct rdt_resource { struct list_head ctrl_domains; struct list_head mon_domains; char *name; - int data_width; u32 default_ctrl; enum resctrl_schema_fmt schema_fmt; struct list_head evt_list; -- cgit v1.2.3 From 634ebb98b929443ea1a5502c93f26d01aedbd5c8 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:36:52 +0000 Subject: x86/resctrl: Add max_bw to struct resctrl_membw __rdt_get_mem_config_amd() and __get_mem_config_intel() both use the default_ctrl property as a maximum value. This is because the MBA schema works differently between these platforms. Doing this complicates determining whether the default_ctrl property belongs to the arch code, or can be derived from the schema format. Deriving the maximum or default value from the schema format would avoid the architecture code having to tell resctrl such obvious things as the maximum percentage is 100, and the maximum bitmap is all ones. Maximum bandwidth is always going to vary per platform. Add max_bw as a special case. This is currently used for the maximum MBA percentage on Intel platforms, but can be removed from the architecture code if 'percentage' becomes a schema format resctrl supports directly. This value isn't needed for other schema formats. This will allow the default_ctrl to be generated from the schema properties when it is needed. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Reviewed-by: Babu Moger Tested-by: Carl Worth # arm64 Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-8-james.morse@arm.com --- include/linux/resctrl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index e1a982adef45..465f3cf8c4bc 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -165,6 +165,7 @@ enum membw_throttle_mode { /** * struct resctrl_membw - Memory bandwidth allocation related data * @min_bw: Minimum memory bandwidth percentage user can request + * @max_bw: Maximum memory bandwidth value, used as the reset value * @bw_gran: Granularity at which the memory bandwidth is allocated * @delay_linear: True if memory B/W delay is in linear scale * @arch_needs_linear: True if we can't configure non-linear resources @@ -175,6 +176,7 @@ enum membw_throttle_mode { */ struct resctrl_membw { u32 min_bw; + u32 max_bw; u32 bw_gran; u32 delay_linear; bool arch_needs_linear; -- cgit v1.2.3 From dbc58f7eec4039ee8f3ec27b2b41d89500debfac Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:36:53 +0000 Subject: x86/resctrl: Generate default_ctrl instead of sharing it The struct rdt_resource default_ctrl is used by both the architecture code for resetting the hardware controls, and sometimes by the filesystem code as the default value for the schema, unless the bandwidth software controller is in use. Having the default exposed by the architecture code causes unnecessary duplication for each architecture as the default value must be specified, but can be derived from other schema properties. Now that the maximum bandwidth is explicitly described, resctrl can derive the default value from the schema format and the other resource properties. 
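A worked example of the derivation, assuming r points at a cache resource with an 11-bit capacity bitmask and, for the second line, at a bandwidth resource whose maximum is 100 percent:

u32 cache_dflt = BIT_MASK(11) - 1;	/* bitmap schema: 0x7ff, all ways set */
u32 mba_dflt = r->membw.max_bw;		/* range schema: 100 */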
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Carl Worth # arm64 Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-9-james.morse@arm.com --- include/linux/resctrl.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 465f3cf8c4bc..d16dc960f1fc 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -216,7 +216,6 @@ enum resctrl_schema_fmt { * @ctrl_domains: RCU list of all control domains for this resource * @mon_domains: RCU list of all monitor domains for this resource * @name: Name to use in "schemata" file. - * @default_ctrl: Specifies default cache cbm or memory B/W percent. * @schema_fmt: Which format string and parser is used for this schema. * @evt_list: List of monitoring events * @cdp_capable: Is the CDP feature available on this resource @@ -233,7 +232,6 @@ struct rdt_resource { struct list_head ctrl_domains; struct list_head mon_domains; char *name; - u32 default_ctrl; enum resctrl_schema_fmt schema_fmt; struct list_head evt_list; bool cdp_capable; @@ -268,6 +266,23 @@ struct resctrl_schema { u32 num_closid; }; +/** + * resctrl_get_default_ctrl() - Return the default control value for this + * resource. + * @r: The resource whose default control type is queried. + */ +static inline u32 resctrl_get_default_ctrl(struct rdt_resource *r) +{ + switch (r->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + return BIT_MASK(r->cache.cbm_len) - 1; + case RESCTRL_SCHEMA_RANGE: + return r->membw.max_bw; + } + + return WARN_ON_ONCE(1); +} + /* The number of closid supported by this resource regardless of CDP */ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); -- cgit v1.2.3 From 6f06aee356bfd0c817cbe83debedd240fbed0719 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:36:55 +0000 Subject: x86/resctrl: Remove rdtgroup from update_cpu_closid_rmid() update_cpu_closid_rmid() takes a struct rdtgroup as an argument, which it uses to update the local CPU's default PQR values. This is a problem once the resctrl parts move out to /fs/, as the arch code cannot poke around inside struct rdtgroup. Rename update_cpu_closid_rmid() to resctrl_arch_sync_cpu_closid_rmid(), to be used as the target of an IPI, and pass the effective CLOSID and RMID in a new struct.
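A hedged sketch of the caller side this implies; the struct and the hook match the header diff that follows, while rdtgrp and cpu_mask are illustrative.

struct resctrl_cpu_defaults defaults = {
	.closid = rdtgrp->closid,
	.rmid = rdtgrp->mon.rmid,
};

/* Refresh every CPU that moved to the default group. */
on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, &defaults, 1);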
Co-developed-by: Dave Martin Signed-off-by: Dave Martin Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Reviewed-by: Babu Moger Tested-by: Carl Worth # arm64 Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-11-james.morse@arm.com --- include/linux/resctrl.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index d16dc960f1fc..31808b3ddecb 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -266,6 +266,28 @@ struct resctrl_schema { u32 num_closid; }; +struct resctrl_cpu_defaults { + u32 closid; + u32 rmid; +}; + +/** + * resctrl_arch_sync_cpu_closid_rmid() - Refresh this CPU's CLOSID and RMID. + * Call via IPI. + * @info: If non-NULL, a pointer to a struct resctrl_cpu_defaults + * specifying the new CLOSID and RMID for tasks in the default + * resctrl ctrl and mon group when running on this CPU. If NULL, + * this CPU is not re-assigned to a different default group. + * + * Propagates reassignment of CPUs and/or tasks to different resctrl groups + * when requested by the resctrl core code. + * + * This function records the per-cpu defaults specified by @info (if any), + * and then reconfigures the CPU's hardware CLOSID and RMID for subsequent + * execution based on @current, in the same way as during a task switch. + */ +void resctrl_arch_sync_cpu_closid_rmid(void *info); + /** * resctrl_get_default_ctrl() - Return the default control value for this * resource. -- cgit v1.2.3 From 8079565d177f5c4eac6c2ee2ac6c404eee76d500 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:36:56 +0000 Subject: x86/resctrl: Expose resctrl fs's init function to the rest of the kernel rdtgroup_init() needs exposing to the rest of the kernel so that arch code can call it once it lives in core code. As this is one of the few functions exposed, rename it to have "resctrl" in the name. The same goes for the exit call. Rename x86's arch code init functions for RDT to have an arch prefix to make it clear these are part of the architecture code. 
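A minimal sketch of the arch-side consumer once resctrl_init() is exposed; resctrl_arch_late_init() and its setup step are hypothetical placeholders, not the exact x86 functions.

static int __init resctrl_arch_late_init(void)
{
	int ret;

	ret = resctrl_arch_setup();	/* hypothetical arch-specific setup */
	if (ret)
		return ret;

	return resctrl_init();		/* the renamed filesystem init */
}
late_initcall(resctrl_arch_late_init);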
Co-developed-by: Dave Martin Signed-off-by: Dave Martin Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Reviewed-by: Babu Moger Tested-by: Carl Worth # arm64 Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-12-james.morse@arm.com --- include/linux/resctrl.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 31808b3ddecb..f1979e375da9 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -402,4 +402,7 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; +int __init resctrl_init(void); +void __exit resctrl_exit(void); + #endif /* _RESCTRL_H */ -- cgit v1.2.3 From e3d5138cefbffa8ea1bf8aab3629268bc3381219 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:36:57 +0000 Subject: x86/resctrl: Move rdt_find_domain() to be visible to arch and fs code rdt_find_domain() finds a domain given a resource and a cache-id. This is used by both the architecture code and the filesystem code. After the filesystem code moves to live in /fs/, this helper is either duplicated by all architectures, or needs exposing by the filesystem code. Add the declaration to the global header file. As it's now globally visible, and has only a handful of callers, swap the 'rdt' for 'resctrl'. Move the function to live with its caller in ctrlmondata.c as the filesystem code will not have anything corresponding to core.c. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Reviewed-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Shaopeng Tan Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-13-james.morse@arm.com --- include/linux/resctrl.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index f1979e375da9..93d9a435f035 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -372,6 +372,20 @@ static inline void resctrl_arch_rmid_read_context_check(void) might_sleep(); } +/** + * resctrl_find_domain() - Search for a domain id in a resource domain list. + * @h: The domain list to search. + * @id: The domain id to search for. + * @pos: A pointer to position in the list id should be inserted. + * + * Search the domain list to find the domain id. If the domain id is + * found, return the domain. NULL otherwise. If the domain id is not + * found (and NULL returned) then the first domain with id bigger than + * the input id can be returned to the caller via @pos. + */ +struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, + struct list_head **pos); + /** * resctrl_arch_reset_rmid() - Reset any private state associated with rmid * and eventid. 
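A usage sketch for the newly shared helper, following the kerneldoc above; r, id and the new-domain handling are illustrative.

struct list_head *pos = NULL;
struct rdt_domain_hdr *hdr;

hdr = resctrl_find_domain(&r->mon_domains, id, &pos);
if (!hdr) {
	/* Not found: a new domain for 'id' would be linked in at 'pos',
	 * e.g. list_add_tail(&new_domain->list, pos); */
}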
-- cgit v1.2.3 From f16adbaf9272ea7ed5d7bf8b261be8a9be949c31 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:36:58 +0000 Subject: x86/resctrl: Move resctrl types to a separate header When resctrl is fully factored into core and per-arch code, each arch will need to use some resctrl common definitions in order to define its own specializations and helpers. Following conventional practice, it would be desirable to put the dependent arch definitions in a header that is included by the common header. However, this can make it awkward to avoid a circular dependency between <linux/resctrl.h> and the arch header. To avoid such dependencies, move the affected common types and constants into a new header <linux/resctrl_types.h> that does not need to depend on <linux/resctrl.h> or on the arch headers. The same logic applies to the monitor-configuration defines; move these too. Some kind of enumeration for events is needed between the filesystem and architecture code. Take the x86 definition as it's convenient for x86. The definition of enum resctrl_event_id is needed to allow the architecture code to define resctrl_arch_mon_ctx_alloc() and resctrl_arch_mon_ctx_free(). The definition of enum resctrl_res_level is needed to allow the architecture code to define resctrl_arch_set_cdp_enabled() and resctrl_arch_get_cdp_enabled(). The bits for mbm_local_bytes_config et al are ABI, and must be the same on all architectures. These are documented in Documentation/arch/x86/resctrl.rst. The maintainers entry for these headers was missed when resctrl.h was created. Add a wildcard entry to match both resctrl.h and resctrl_types.h. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Reviewed-by: Babu Moger Tested-by: Carl Worth # arm64 Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-14-james.morse@arm.com --- include/linux/resctrl.h | 21 +---------------- include/linux/resctrl_types.h | 54 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 20 deletions(-) create mode 100644 include/linux/resctrl_types.h (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 93d9a435f035..326f7ba220e7 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -6,6 +6,7 @@ #include <linux/kernel.h> #include <linux/list.h> #include <linux/pid.h> +#include <linux/resctrl_types.h> /* CLOSID, RMID value used by the default control group */ #define RESCTRL_RESERVED_CLOSID 0 @@ -37,28 +38,8 @@ enum resctrl_conf_type { CDP_DATA, }; -enum resctrl_res_level { - RDT_RESOURCE_L3, - RDT_RESOURCE_L2, - RDT_RESOURCE_MBA, - RDT_RESOURCE_SMBA, - - /* Must be the last */ - RDT_NUM_RESOURCES, -}; - #define CDP_NUM_TYPES (CDP_DATA + 1) -/* - * Event IDs, the values match those used to program IA32_QM_EVTSEL before - * reading IA32_QM_CTR on RDT systems. - */ -enum resctrl_event_id { - QOS_L3_OCCUP_EVENT_ID = 0x01, - QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, - QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, -}; - /** * struct resctrl_staged_config - parsed configuration to be applied * @new_ctrl: new ctrl value to be loaded diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h new file mode 100644 index 000000000000..f26450b3326b --- /dev/null +++ b/include/linux/resctrl_types.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Arm Ltd.
+ * Based on arch/x86/kernel/cpu/resctrl/internal.h + */ + +#ifndef __LINUX_RESCTRL_TYPES_H +#define __LINUX_RESCTRL_TYPES_H + +/* Reads to Local DRAM Memory */ +#define READS_TO_LOCAL_MEM BIT(0) + +/* Reads to Remote DRAM Memory */ +#define READS_TO_REMOTE_MEM BIT(1) + +/* Non-Temporal Writes to Local Memory */ +#define NON_TEMP_WRITE_TO_LOCAL_MEM BIT(2) + +/* Non-Temporal Writes to Remote Memory */ +#define NON_TEMP_WRITE_TO_REMOTE_MEM BIT(3) + +/* Reads to Local Memory the system identifies as "Slow Memory" */ +#define READS_TO_LOCAL_S_MEM BIT(4) + +/* Reads to Remote Memory the system identifies as "Slow Memory" */ +#define READS_TO_REMOTE_S_MEM BIT(5) + +/* Dirty Victims to All Types of Memory */ +#define DIRTY_VICTIMS_TO_ALL_MEM BIT(6) + +/* Max event bits supported */ +#define MAX_EVT_CONFIG_BITS GENMASK(6, 0) + +enum resctrl_res_level { + RDT_RESOURCE_L3, + RDT_RESOURCE_L2, + RDT_RESOURCE_MBA, + RDT_RESOURCE_SMBA, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +/* + * Event IDs, the values match those used to program IA32_QM_EVTSEL before + * reading IA32_QM_CTR on RDT systems. + */ +enum resctrl_event_id { + QOS_L3_OCCUP_EVENT_ID = 0x01, + QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, + QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, +}; + +#endif /* __LINUX_RESCTRL_TYPES_H */ -- cgit v1.2.3 From 9be68b144a5b539c8503f2944888f7a30e669291 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:36:59 +0000 Subject: x86/resctrl: Add an arch helper to reset one resource On umount(), resctrl resets each resource back to its default configuration. It only ever does this for all resources in one go. reset_all_ctrls() is architecture specific as it works with struct rdt_hw_resource. Make reset_all_ctrls() an arch helper that resets one resource. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tony Luck Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Reviewed-by: Babu Moger Reviewed-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Shaopeng Tan Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-15-james.morse@arm.com --- include/linux/resctrl.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 326f7ba220e7..487b9657a7b5 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -394,6 +394,15 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, */ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d); +/** + * resctrl_arch_reset_all_ctrls() - Reset the control for each CLOSID to its + * default. + * @r: The resctrl resource to reset. + * + * This can be called from any CPU. + */ +void resctrl_arch_reset_all_ctrls(struct rdt_resource *r); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; -- cgit v1.2.3 From 88464bff035ecb28f8bf52dd9728fdc1aca228f9 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:37:02 +0000 Subject: x86/resctrl: Rewrite and move the for_each_*_rdt_resource() walkers The for_each_*_rdt_resource() helpers walk the architecture's array of structures, using the resctrl visible part as an iterator. These became over-complex when the structures were split into a filesystem and architecture-specific struct. 
This approach avoided the need to touch every call site, and was done before there was a helper to retrieve a resource by rid. Once the filesystem parts of resctrl are moved to /fs/, both the arch's resource array, and the definition of those structures is no longer accessible. To support resctrl, each architecture would have to provide equally complex macros. Rewrite the macro to make use of resctrl_arch_get_resource(), and move these to include/linux/resctrl.h so existing x86 arch code continues to use them. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Reviewed-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-18-james.morse@arm.com --- include/linux/resctrl.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 487b9657a7b5..a392480dc4b6 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -26,6 +26,24 @@ int proc_resctrl_show(struct seq_file *m, /* max value for struct rdt_domain's mbps_val */ #define MBA_MAX_MBPS U32_MAX +/* Walk all possible resources, with variants for only controls or monitors. */ +#define for_each_rdt_resource(_r) \ + for ((_r) = resctrl_arch_get_resource(0); \ + (_r) && (_r)->rid < RDT_NUM_RESOURCES; \ + (_r) = resctrl_arch_get_resource((_r)->rid + 1)) + +#define for_each_capable_rdt_resource(r) \ + for_each_rdt_resource((r)) \ + if ((r)->alloc_capable || (r)->mon_capable) + +#define for_each_alloc_capable_rdt_resource(r) \ + for_each_rdt_resource((r)) \ + if ((r)->alloc_capable) + +#define for_each_mon_capable_rdt_resource(r) \ + for_each_rdt_resource((r)) \ + if ((r)->mon_capable) + /** * enum resctrl_conf_type - The type of configuration. * @CDP_NONE: No prioritisation, both code and data are controlled or monitored. -- cgit v1.2.3 From d81826f87a80a86cc5963ff3c44f05700ded3fc9 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:37:04 +0000 Subject: x86/resctrl: Add resctrl_arch_is_evt_configurable() to abstract BMEC When BMEC is supported the resctrl event can be configured in a number of ways. This depends on architecture support. rdt_get_mon_l3_config() modifies the struct mon_evt and calls resctrl_file_fflags_init() to create the files that allow the configuration. Splitting this into separate architecture and filesystem parts would require the struct mon_evt and resctrl_file_fflags_init() to be exposed. Instead, add resctrl_arch_is_evt_configurable(), and use this from resctrl_mon_resource_init() to initialise struct mon_evt and call resctrl_file_fflags_init(). resctrl_arch_is_evt_configurable() calls rdt_cpu_has() so it doesn't obviously benefit from being inlined. Putting it in core.c will allow rdt_cpu_has() to eventually become static. 
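A sketch of the filesystem side described above (the mon_evt instances and the RFTYPE_MON_INFO flag are illustrative names, not lifted from this patch):

	/* in resctrl_mon_resource_init() */
	if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
		mbm_total_event.configurable = true;
		resctrl_file_fflags_init("mbm_total_bytes_config", RFTYPE_MON_INFO);
	}
	if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
		mbm_local_event.configurable = true;
		resctrl_file_fflags_init("mbm_local_bytes_config", RFTYPE_MON_INFO);
	}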
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Carl Worth # arm64 Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-20-james.morse@arm.com --- include/linux/resctrl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index a392480dc4b6..cc76f308e6f3 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -309,6 +309,8 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); +__init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); + /* * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. -- cgit v1.2.3 From 650680d651aaf409f097ad904c3fea6c4b3a0884 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:37:05 +0000 Subject: x86/resctrl: Change mon_event_config_{read,write}() to be arch helpers mon_event_config_{read,write}() are called via IPI and access model specific registers to do their work. To support another architecture, this needs abstracting. Rename mon_event_config_{read,write}() to have a "resctrl_arch_" prefix, and move their struct mon_config_info parameter into <linux/resctrl.h>. This allows another architecture to supply an implementation of these. As struct mon_config_info is now exposed globally, give it a 'resctrl_' prefix. MPAM systems need access to the domain to do this work; add the resource and domain to struct resctrl_mon_config_info. Co-developed-by: Dave Martin Signed-off-by: Dave Martin Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Reviewed-by: Babu Moger Tested-by: Carl Worth # arm64 Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-21-james.morse@arm.com --- include/linux/resctrl.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index cc76f308e6f3..90b6563cf532 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -270,6 +270,13 @@ struct resctrl_cpu_defaults { u32 rmid; }; +struct resctrl_mon_config_info { + struct rdt_resource *r; + struct rdt_mon_domain *d; + u32 evtid; + u32 mon_config; +}; + /** * resctrl_arch_sync_cpu_closid_rmid() - Refresh this CPU's CLOSID and RMID. * Call via IPI. @@ -311,6 +318,30 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); __init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); +/** + * resctrl_arch_mon_event_config_write() - Write the config for an event. + * @config_info: struct resctrl_mon_config_info describing the resource, domain + * and event. + * + * Reads resource, domain and eventid from @config_info and writes the + * event config_info->mon_config into hardware. + * + * Called via IPI to reach a CPU that is a member of the specified domain.
+ */ +void resctrl_arch_mon_event_config_write(void *config_info); + +/** + * resctrl_arch_mon_event_config_read() - Read the config for an event. + * @config_info: struct resctrl_mon_config_info describing the resource, domain + * and event. + * + * Reads resource, domain and eventid from @config_info and reads the + * hardware config value into config_info->mon_config. + * + * Called via IPI to reach a CPU that is a member of the specified domain. + */ +void resctrl_arch_mon_event_config_read(void *config_info); + /* * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. -- cgit v1.2.3 From c32a7d7777800b61a5c9272cc94fe21aa919b305 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:37:07 +0000 Subject: x86/resctrl: Move mbm_cfg_mask to struct rdt_resource The mbm_cfg_mask field lists the bits that user-space can set when configuring an event. This value is output via the last_cmd_status file. Once the filesystem parts of resctrl are moved to live in /fs/, the struct rdt_hw_resource is inaccessible to the filesystem code. Because this value is output to user-space, it has to be accessible to the filesystem code. Move it to struct rdt_resource. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Reviewed-by: Babu Moger Tested-by: Carl Worth # arm64 Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-23-james.morse@arm.com --- include/linux/resctrl.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 90b6563cf532..914df24ffe48 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -217,6 +217,8 @@ enum resctrl_schema_fmt { * @name: Name to use in "schemata" file. * @schema_fmt: Which format string and parser is used for this schema. * @evt_list: List of monitoring events + * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth + * monitoring events can be configured. * @cdp_capable: Is the CDP feature available on this resource */ struct rdt_resource { @@ -233,6 +235,7 @@ struct rdt_resource { char *name; enum resctrl_schema_fmt schema_fmt; struct list_head evt_list; + unsigned int mbm_cfg_mask; bool cdp_capable; }; -- cgit v1.2.3 From 4cf9acfc8f1a322576f0666b882d631cfdf714bf Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:37:11 +0000 Subject: x86/resctrl: Make resctrl_arch_pseudo_lock_fn() take a plr resctrl_arch_pseudo_lock_fn() has architecture specific behaviour, and takes a struct rdtgroup as an argument. After the filesystem code moves to /fs/, the definition of struct rdtgroup will not be available to the architecture code. The only reason resctrl_arch_pseudo_lock_fn() wants the rdtgroup is for the CLOSID. Embed that in the pseudo_lock_region as a closid, and move the definition of struct pseudo_lock_region to resctrl.h. 
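The shape of that change, sketched (the arch-specific measurement and cache-fill work is elided; only the argument handling is the point here):

	int resctrl_arch_pseudo_lock_fn(void *_plr)
	{
		struct pseudo_lock_region *plr = _plr;

		/* formerly taken from the struct rdtgroup argument */
		u32 closid = plr->closid;

		/* ... fill plr->kmem (plr->size bytes) while running with closid ... */
		return 0;
	}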
Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Reviewed-by: Babu Moger Tested-by: Carl Worth # arm64 Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-27-james.morse@arm.com --- include/linux/resctrl.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 914df24ffe48..226cc5c0d765 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -58,6 +58,45 @@ enum resctrl_conf_type { #define CDP_NUM_TYPES (CDP_DATA + 1) +/* + * struct pseudo_lock_region - pseudo-lock region information + * @s: Resctrl schema for the resource to which this + * pseudo-locked region belongs + * @closid: The closid that this pseudo-locked region uses + * @d: RDT domain to which this pseudo-locked region + * belongs + * @cbm: bitmask of the pseudo-locked region + * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread + * completion + * @thread_done: variable used by waitqueue to test if pseudo-locking + * thread completed + * @cpu: core associated with the cache on which the setup code + * will be run + * @line_size: size of the cache lines + * @size: size of pseudo-locked region in bytes + * @kmem: the kernel memory associated with pseudo-locked region + * @minor: minor number of character device associated with this + * region + * @debugfs_dir: pointer to this region's directory in the debugfs + * filesystem + * @pm_reqs: Power management QoS requests related to this region + */ +struct pseudo_lock_region { + struct resctrl_schema *s; + u32 closid; + struct rdt_ctrl_domain *d; + u32 cbm; + wait_queue_head_t lock_thread_wq; + int thread_done; + int cpu; + unsigned int line_size; + unsigned int size; + void *kmem; + unsigned int minor; + struct dentry *debugfs_dir; + struct list_head pm_reqs; +}; + /** * struct resctrl_staged_config - parsed configuration to be applied * @new_ctrl: new ctrl value to be loaded -- cgit v1.2.3 From f62b4e45e0b467cba75ae816338b996c6dc4dafa Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 11 Mar 2025 18:37:14 +0000 Subject: x86/resctrl: Move get_config_index() to a header get_config_index() is used by the architecture specific code to map a CLOSID+type pair to an index in the configuration arrays. MPAM needs to do this too to preserve the ABI to user-space, there is no reason to do it differently. Move the helper to a header file to allow all architectures that either use or emulate CDP to use the same pattern of CLOSID values. Moving this to a header file means it must be marked inline, which matches the existing compiler choice for this static function. 
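The effect is easiest to see with concrete values; with CDP enabled each resctrl CLOSID maps to an even/odd pair of hardware indices (the values follow directly from the helper added below):

	resctrl_get_config_index(3, CDP_NONE);	/* -> 3, CDP disabled */
	resctrl_get_config_index(3, CDP_DATA);	/* -> 6, even index */
	resctrl_get_config_index(3, CDP_CODE);	/* -> 7, odd index */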
Co-developed-by: Dave Martin Signed-off-by: Dave Martin Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Reviewed-by: Babu Moger Tested-by: Carl Worth # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Babu Moger Link: https://lore.kernel.org/r/20250311183715.16445-30-james.morse@arm.com --- include/linux/resctrl.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 226cc5c0d765..880351ca3dfc 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -384,6 +384,21 @@ void resctrl_arch_mon_event_config_write(void *config_info); */ void resctrl_arch_mon_event_config_read(void *config_info); +/* For use by arch code to remap resctrl's smaller CDP CLOSID range */ +static inline u32 resctrl_get_config_index(u32 closid, + enum resctrl_conf_type type) +{ + switch (type) { + default: + case CDP_NONE: + return closid; + case CDP_CODE: + return closid * 2 + 1; + case CDP_DATA: + return closid * 2; + } +} + /* * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. -- cgit v1.2.3 From 9bce6b5f8987678b9c6c1fe433af6b5fe41feadc Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Tue, 11 Mar 2025 19:43:59 +0900 Subject: block: change blk_mq_add_to_batch() third argument type to bool Commit 1f47ed294a2b ("block: cleanup and fix batch completion adding conditions") modified the evaluation criteria for the third argument, 'ioerror', in the blk_mq_add_to_batch() function. Initially, the function had checked if 'ioerror' equals zero. Following the commit, it started checking for negative error values, with the presumption that such values, for instance -EIO, would be passed in. However, blk_mq_add_to_batch() callers do not pass negative error values. Instead, they pass status codes defined in various ways: - NVMe PCI and Apple drivers pass NVMe status code - virtio_blk driver passes the virtblk request header status byte - null_blk driver passes blk_status_t These codes are either zero or positive, therefore the revised check fails to function as intended. Specifically, with the NVMe PCI driver, this modification led to the failure of the blktests test case nvme/039. In this test scenario, errors are artificially injected into the NVMe driver, resulting in positive NVMe status codes passed to blk_mq_add_to_batch(), which unexpectedly processes the failed I/O in a batch. Hence the failure. To correct the ioerror check within blk_mq_add_to_batch(), make all callers uniformly pass the argument as a boolean. Modify the callers to check their specific status codes and pass the boolean value 'is_error'. Also describe the arguments of blk_mq_add_to_batch() as kerneldoc.
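For example, the NVMe PCI completion path can then derive the boolean from its own status word, roughly like this (a sketch based on the description above, not the literal driver hunk):

	if (!blk_mq_add_to_batch(req, iob,
				 nvme_req(req)->status != NVME_SC_SUCCESS,
				 nvme_pci_complete_batch))
		nvme_pci_complete_rq(req);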
Fixes: 1f47ed294a2b ("block: cleanup and fix batch completion adding conditions") Signed-off-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20250311104359.1767728-3-shinichiro.kawasaki@wdc.com [axboe: fold in documentation update] Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 71f4f0cc3dac..aba9c24486aa 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -852,12 +852,20 @@ static inline bool blk_mq_is_reserved_rq(struct request *rq) return rq->rq_flags & RQF_RESV; } -/* +/** + * blk_mq_add_to_batch() - add a request to the completion batch + * @req: The request to add to the batch + * @iob: The batch to add the request to + * @is_error: Specify true if the request failed with an error + * @complete: The completion handler for the request + * * Batched completions only work when there is no I/O error and no special * ->end_io handler. + * + * Return: true when the request was added to the batch, otherwise false */ static inline bool blk_mq_add_to_batch(struct request *req, - struct io_comp_batch *iob, int ioerror, + struct io_comp_batch *iob, bool is_error, void (*complete)(struct io_comp_batch *)) { /* * 1) No batch container * 2) Has scheduler data attached * 3) Not a passthrough request and end_io set - * 4) Not a passthrough request and an ioerror + * 4) Not a passthrough request and failed with an error */ if (!iob) return false; @@ -865,7 +873,7 @@ static inline bool blk_mq_add_to_batch(struct request *req, if (!blk_rq_is_passthrough(req)) { if (req->end_io) return false; - if (ioerror < 0) + if (is_error) return false; } -- cgit v1.2.3 From 066e053fe208a3b83ee89dc5a192146add688861 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 12 Mar 2025 08:38:47 +0100 Subject: fsnotify: add pre-content hooks on mmap() Pre-content hooks in page faults introduce a potential deadlock between the HSM handler in userspace and filesystem freezing. The requirement with pre-content events is that for every accessed file range an event covering at least this range will be generated at least once before the file data is accessed. In preparation for disabling pre-content event hooks on page faults, add pre-content hooks at the mmap() variants for the entire mmapped range, so the HSM can fill in content when a user requests to map a portion of the file. Note that the exec() variant also calls vm_mmap_pgoff() internally to map code sections, so pre-content hooks are also generated in this case.
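The hook placement in vm_mmap_pgoff() then looks roughly as follows (a sketch; the remainder of the mapping path is elided):

	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
	int ret;

	/* one pre-content event covers the entire range being mapped */
	ret = fsnotify_mmap_perm(file, prot, off, len);
	if (ret)
		return ret;
	/* ... security checks and the actual mapping follow ... */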
Link: https://lore.kernel.org/linux-fsdevel/7ehxrhbvehlrjwvrduoxsao5k3x4aw275patsb3krkwuq573yv@o2hskrfawbnc/ Suggested-by: Josef Bacik Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/20250312073852.2123409-2-amir73il@gmail.com --- include/linux/fsnotify.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 6a33288bd6a1..83d3ac97f826 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -170,6 +170,21 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } +/* + * fsnotify_mmap_perm - permission hook before mmap of file range + */ +static inline int fsnotify_mmap_perm(struct file *file, int prot, + const loff_t off, size_t len) +{ + /* + * mmap() generates only pre-content events. + */ + if (!file || likely(!FMODE_FSNOTIFY_HSM(file->f_mode))) + return 0; + + return fsnotify_pre_content(&file->f_path, &off, len); +} + /* * fsnotify_truncate_perm - permission hook before file truncate */ @@ -223,6 +238,12 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, return 0; } +static inline int fsnotify_mmap_perm(struct file *file, int prot, + const loff_t off, size_t len) +{ + return 0; +} + static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) { return 0; } -- cgit v1.2.3 From 425b1c97b07f2290700f708edabef32861e2b2db Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 10 Mar 2025 15:24:33 -0700 Subject: PNP: Expand length of fixup id string GCC 15's -Wunterminated-string-initialization saw that "id" did not include the required trailing NUL character. Instead of marking "id" with __nonstring[1], expand the length of the string as it is used in (debugging) format strings that expect a properly formed C string. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117178 [1] Signed-off-by: Kees Cook Link: https://patch.msgid.link/20250310222432.work.826-kees@kernel.org Signed-off-by: Rafael J. Wysocki --- include/linux/pnp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pnp.h b/include/linux/pnp.h index b7a7158aaf65..23fe3eaf242d 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -290,7 +290,7 @@ static inline void pnp_set_drvdata(struct pnp_dev *pdev, void *data) } struct pnp_fixup { - char id[7]; + char id[8]; void (*quirk_function) (struct pnp_dev *dev); /* fixup function */ }; -- cgit v1.2.3 From 0a13c1e0a449917b29c45d90701eededa69c99d3 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 7 Mar 2025 20:47:26 -0800 Subject: net: revert to lockless TC_SETUP_BLOCK and TC_SETUP_FT There are a couple of places from which we can arrive at ndo_setup_tc with TC_SETUP_BLOCK/TC_SETUP_FT: - netlink - netlink notifier - netdev notifier Locking netdev too deep in this call chain seems to be problematic (especially assuming some/all of the call_netdevice_notifiers(NETDEV_UNREGISTER) calls might soon be running with the instance lock). Revert to lockless ndo_setup_tc for TC_SETUP_BLOCK/TC_SETUP_FT. The NFT framework already takes care of most of the locking. Document the assumptions.
ndo_setup_tc TC_SETUP_BLOCK nft_block_offload_cmd nft_chain_offload_cmd nft_flow_block_chain nft_flow_offload_chain nft_flow_rule_offload_abort nft_flow_rule_offload_commit nft_flow_rule_offload_commit nf_tables_commit nfnetlink_rcv_batch nfnetlink_rcv_skb_batch nfnetlink_rcv nft_offload_netdev_event NETDEV_UNREGISTER notifier ndo_setup_tc TC_SETUP_FT nf_flow_table_offload_cmd nf_flow_table_offload_setup nft_unregister_flowtable_hook nft_register_flowtable_net_hooks nft_flowtable_update nf_tables_newflowtable nfnetlink_rcv_batch (.call NFNL_CB_BATCH) nft_flowtable_update nf_tables_newflowtable nft_flowtable_event nf_tables_flowtable_event NETDEV_UNREGISTER notifier __nft_unregister_flowtable_net_hooks nft_unregister_flowtable_net_hooks nf_tables_commit nfnetlink_rcv_batch (.call NFNL_CB_BATCH) __nf_tables_abort nf_tables_abort nfnetlink_rcv_batch __nft_release_hook __nft_release_hooks nf_tables_pre_exit_net -> module unload nft_rcv_nl_event netlink_register_notifier (oh boy) nft_register_flowtable_net_hooks nft_flowtable_update nf_tables_newflowtable nf_tables_newflowtable Fixes: c4f0f30b424e ("net: hold netdev instance lock during nft ndo_setup_tc") Signed-off-by: Stanislav Fomichev Reported-by: syzbot+0afb4bcf91e5a1afdcad@syzkaller.appspotmail.com Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250308044726.1193222-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9a297757df7e..0dbfe069a6e3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3316,8 +3316,6 @@ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack); void netif_close(struct net_device *dev); void dev_close(struct net_device *dev); void dev_close_many(struct list_head *head, bool unlink); -int dev_setup_tc(struct net_device *dev, enum tc_setup_type type, - void *type_data); void netif_disable_lro(struct net_device *dev); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); -- cgit v1.2.3 From 13b4f9e126cb55b65430bae7e704cea23c78620c Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 7 Mar 2025 02:17:50 +0000 Subject: PM: sleep: Remove unused pm_generic_ wrappers pm_generic_thaw_early() has been unused since 2016's commit 294f47ffd55c ("PM / Domains: Remove redundant system PM callbacks") pm_generic_freeze_late() has been unused since 2019's commit 3cd7957e85e6 ("ACPI: PM: Simplify and fix PM domain hibernation callbacks") Remove them. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20250307021750.457600-1-linux@treblig.org Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 63a8dffda787..f0bd8fbae4f2 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -839,10 +839,8 @@ extern int pm_generic_resume_early(struct device *dev); extern int pm_generic_resume_noirq(struct device *dev); extern int pm_generic_resume(struct device *dev); extern int pm_generic_freeze_noirq(struct device *dev); -extern int pm_generic_freeze_late(struct device *dev); extern int pm_generic_freeze(struct device *dev); extern int pm_generic_thaw_noirq(struct device *dev); -extern int pm_generic_thaw_early(struct device *dev); extern int pm_generic_thaw(struct device *dev); extern int pm_generic_restore_noirq(struct device *dev); extern int pm_generic_restore_early(struct device *dev); @@ -884,10 +882,8 @@ static inline void dpm_for_each_dev(void *data, void (*fn)(struct device *, void #define pm_generic_resume_noirq NULL #define pm_generic_resume NULL #define pm_generic_freeze_noirq NULL -#define pm_generic_freeze_late NULL #define pm_generic_freeze NULL #define pm_generic_thaw_noirq NULL -#define pm_generic_thaw_early NULL #define pm_generic_thaw NULL #define pm_generic_restore_noirq NULL #define pm_generic_restore_early NULL -- cgit v1.2.3 From b688f369ae0d5d25865f5441fa62e54c7d5d0de6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 10 Mar 2025 14:42:48 -0700 Subject: compiler_types: Introduce __nonstring_array GCC has expanded support of the "nonstring" attribute so that it can be applied to arrays of character arrays[1], which is needed to identify correct static initialization of those kinds of objects. Since this was not supported prior to GCC 15, we need to distinguish the usage of Linux's existing __nonstring macro for the attribute for non-multi-dimensional char arrays. Until GCC 15 is the minimum version, use __nonstring_array to mark arrays of non-string character arrays. (Regular non-string character arrays can continue to use __nonstring.) Once GCC 15 is the minimum compiler version we can replace all uses of __nonstring_array with just __nonstring and remove this macro. 
This allows for changes like this: -static const char table_sigs[][ACPI_NAMESEG_SIZE] __initconst = { +static const char table_sigs[][ACPI_NAMESEG_SIZE] __nonstring_array __initconst = { ACPI_SIG_BERT, ACPI_SIG_BGRT, ACPI_SIG_CPEP, ACPI_SIG_ECDT, Which will silence the coming -Wunterminated-string-initialization warnings in GCC 15: In file included from ../include/acpi/actbl.h:371, from ../include/acpi/acpi.h:26, from ../include/linux/acpi.h:26, from ../drivers/acpi/tables.c:19: ../include/acpi/actbl1.h:30:33: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (5 chars into 4 available) [-Wunterminated-string-initialization] 30 | #define ACPI_SIG_BERT "BERT" /* Boot Error Record Table */ | ^~~~~~ ../drivers/acpi/tables.c:400:9: note: in expansion of macro 'ACPI_SIG_BERT' 400 | ACPI_SIG_BERT, ACPI_SIG_BGRT, ACPI_SIG_CPEP, ACPI_SIG_ECDT, | ^~~~~~~~~~~~~ ../include/acpi/actbl1.h:31:33: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (5 chars into 4 available) [-Wunterminated-string-initialization] 31 | #define ACPI_SIG_BGRT "BGRT" /* Boot Graphics Resource Table */ | ^~~~~~ Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117178 [1] Acked-by: Miguel Ojeda Link: https://lore.kernel.org/r/20250310214244.work.194-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/compiler_types.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 4ad3e900bc3d..e09d323be845 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -348,6 +348,18 @@ struct ftrace_likely_data { # define __counted_by(member) #endif +/* + * Optional: only supported since gcc >= 15 + * Optional: not supported by Clang + * + * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117178 + */ +#ifdef CONFIG_CC_HAS_MULTIDIMENSIONAL_NONSTRING +# define __nonstring_array __attribute__((__nonstring__)) +#else +# define __nonstring_array +#endif + /* * Apply __counted_by() when the Endianness matches to increase test coverage. */ -- cgit v1.2.3 From 3efeeaf85f5cab84aa05003308eddb62e2acf5bd Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 7 Mar 2025 21:23:47 +0000 Subject: PM: clk: Remove unused pm_clk_remove() pm_clk_remove() is currently unused. It hasn't been used since at least 2011 when it was renamed from pm_runtime_clk_remove() by commit 3d5c30367cbc ("PM: Rename clock management functions") Remove it. Note that the __pm_clk_remove() is still used and is left in. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20250307212347.68785-1-linux@treblig.org Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm_clock.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_clock.h b/include/linux/pm_clock.h index 45c3f3ccbaf8..c3b46fa358d3 100644 --- a/include/linux/pm_clock.h +++ b/include/linux/pm_clock.h @@ -42,7 +42,6 @@ extern void pm_clk_destroy(struct device *dev); extern int pm_clk_add(struct device *dev, const char *con_id); extern int pm_clk_add_clk(struct device *dev, struct clk *clk); extern int of_pm_clk_add_clks(struct device *dev); -extern void pm_clk_remove(struct device *dev, const char *con_id); extern void pm_clk_remove_clk(struct device *dev, struct clk *clk); extern int pm_clk_suspend(struct device *dev); extern int pm_clk_resume(struct device *dev); @@ -75,9 +74,6 @@ static inline int of_pm_clk_add_clks(struct device *dev) { return -EINVAL; } -static inline void pm_clk_remove(struct device *dev, const char *con_id) -{ -} #define pm_clk_suspend NULL #define pm_clk_resume NULL static inline void pm_clk_remove_clk(struct device *dev, struct clk *clk) -- cgit v1.2.3 From 1d25bdd3f3831bb1b9512d4b5afcd2dea8a0c515 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Mar 2025 09:13:32 +0100 Subject: posix-timers: Rework timer removal sys_timer_delete() and the do_exit() cleanup function itimer_delete() are doing the same thing, but have needlessly different implementations instead of sharing the code. The other oddity of timer deletion is the fact that the timer is not invalidated before the actual deletion happens, which allows concurrent lookups to succeed. That's wrong because a timer which is in the process of being deleted should not be visible and any actions like signal queueing, delivery and rearming should not happen once the task, which invoked timer_delete(), has the timer locked. Rework the code so that: 1) The signal queueing and delivery code ignore timers which are marked invalid 2) The deletion implementation between sys_timer_delete() and itimer_delete() is shared 3) The timer is invalidated and removed from the linked lists before the deletion callback of the relevant clock is invoked. That requires to rework timer_wait_running() as it does a lookup of the timer when relocking it at the end. In case of deletion this lookup would fail due to the preceding invalidation and the wait loop would terminate prematurely. But due to the preceding invalidation the timer cannot be accessed by other tasks anymore, so there is no way that the timer has been freed after the timer lock has been dropped. Move the re-validation out of timer_wait_running() and handle it at the only other usage site, timer_settime(). 
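With the timer invalidated up front, the queueing and delivery paths can bail out early; a sketch using the posixtimer_valid() helper from the hunk below:

	/* ignore timers whose deletion is already underway */
	if (!posixtimer_valid(tmr))
		return;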
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/87zfht1exf.ffs@tglx --- include/linux/posix-timers.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index f11f10c97bd9..e714a55e1ddd 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -240,6 +240,13 @@ static inline void posixtimer_sigqueue_putref(struct sigqueue *q) posixtimer_putref(tmr); } + +static inline bool posixtimer_valid(const struct k_itimer *timer) +{ + unsigned long val = (unsigned long)timer->it_signal; + + return !(val & 0x1UL); +} #else /* CONFIG_POSIX_TIMERS */ static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { } static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { } -- cgit v1.2.3 From 538d710ec74233f99dc0fd604d45a2b6143c8e2c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 8 Mar 2025 17:48:34 +0100 Subject: posix-timers: Make lock_timer() use guard() The lookup and locking of posix timers requires the same repeating pattern at all usage sites: tmr = lock_timer(timer_id); if (!tmr) return -EINVAL; .... unlock_timer(tmr); Solve this with a guard implementation, which works in most places out of the box except for those which need to unlock the timer inside the guard scope. Though the only places where this matters are timer_delete() and timer_settime(). In both cases the timer pointer needs to be preserved across the end of the scope, which is solved by storing the pointer in a variable outside of the scope. timer_settime() also has to protect the timer with RCU before unlocking, which obviously can't use guard(rcu) before leaving the guard scope as that guard is cleaned up before the unlock. Solve this by providing the RCU protection open coded.
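The conversion collapses the repeating pattern at the call sites to something like this (do_something() stands in for the per-syscall work):

	/* before: manual lock/unlock pairing */
	tmr = lock_timer(timer_id);
	if (!tmr)
		return -EINVAL;
	ret = do_something(tmr);
	unlock_timer(tmr);
	return ret;

	/* after: cleanup.h based guard, unlock runs automatically at scope exit */
	CLASS(lock_timer, tmr)(timer_id);
	if (!tmr)
		return -EINVAL;
	return do_something(tmr);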
[ tglx: Made it work and added change log ] Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20250224162103.GD11590@noisy.programming.kicks-ass.net Link: https://lore.kernel.org/all/20250308155624.087465658@linutronix.de --- include/linux/cleanup.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index ec00e3f7af2b..a176abfccba8 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -291,11 +291,21 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ #define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond -#define DEFINE_GUARD(_name, _type, _lock, _unlock) \ +#define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \ + static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ + { return (void *)(__force unsigned long)*(_exp); } + +#define DEFINE_CLASS_IS_GUARD(_name) \ __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \ + __DEFINE_GUARD_LOCK_PTR(_name, _T) + +#define DEFINE_CLASS_IS_COND_GUARD(_name) \ + __DEFINE_CLASS_IS_CONDITIONAL(_name, true); \ + __DEFINE_GUARD_LOCK_PTR(_name, _T) + +#define DEFINE_GUARD(_name, _type, _lock, _unlock) \ DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \ - static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ - { return (void *)(__force unsigned long)*_T; } + DEFINE_CLASS_IS_GUARD(_name) #define DEFINE_GUARD_COND(_name, _ext, _condlock) \ __DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \ @@ -375,11 +385,7 @@ static inline void class_##_name##_destructor(class_##_name##_t *_T) \ if (_T->lock) { _unlock; } \ } \ \ -static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ -{ \ - return (void *)(__force unsigned long)_T->lock; \ -} - +__DEFINE_GUARD_LOCK_PTR(_name, &_T->lock) #define __DEFINE_LOCK_GUARD_1(_name, _type, _lock) \ static inline class_##_name##_t class_##_name##_constructor(_type *l) \ -- cgit v1.2.3 From feb864ee99a2d8a22800342388401f3a3b90d42b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 8 Mar 2025 17:48:36 +0100 Subject: posix-timers: Make signal_struct:: Next_posix_timer_id an atomic_t The global hash_lock protecting the posix timer hash table can be heavily contended especially when there is an extensive linear search for a timer ID. Timer IDs are handed out by monotonically increasing next_posix_timer_id and then validating that there is no timer with the same ID in the hash table. Both operations happen with the global hash lock held. To reduce the hash lock contention the hash will be reworked to a scaled hash with per bucket locks, which requires to handle the ID counter lockless. Prepare for this by making next_posix_timer_id an atomic_t, which can be used lockless with atomic_inc_return(). 
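After the conversion a candidate ID can be produced without taking any lock, along these lines (the mask keeping the ID non-negative is illustrative; collision checks against the hash table still happen separately):

	id = atomic_inc_return(&sig->next_posix_timer_id) & INT_MAX;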
[ tglx: Adopted from Eric's series, massaged change log and simplified it ] Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20250219125522.2535263-2-edumazet@google.com Link: https://lore.kernel.org/all/20250308155624.151545978@linutronix.de --- include/linux/sched/signal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index d5d03d919df8..72649d7fce2a 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -136,7 +136,7 @@ struct signal_struct { #ifdef CONFIG_POSIX_TIMERS /* POSIX.1b Interval Timers */ - unsigned int next_posix_timer_id; + atomic_t next_posix_timer_id; struct hlist_head posix_timers; struct hlist_head ignored_posix_timers; -- cgit v1.2.3 From 5fa75a432f1a6b1402edd8802ecc14f8bbb90e49 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 8 Mar 2025 17:48:42 +0100 Subject: posix-timers: Avoid false cacheline sharing struct k_itimer has the hlist_node, which is used for lookup in the hash bucket, and the timer lock in the same cache line. That's obviously bad, if one CPU fiddles with a timer and the other is walking the hash bucket on which that timer is queued. Avoid this by restructuring struct k_itimer, so that the read mostly (only modified during setup and teardown) fields are in the first cache line and the lock and the rest of the fields which get written to are in cacheline 2-N. Reduces cacheline contention in a test case of 64 processes creating and accessing 20000 timers each by almost 30% according to perf. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20250308155624.341108067@linutronix.de --- include/linux/posix-timers.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index e714a55e1ddd..094ef5778c0f 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -177,23 +177,26 @@ static inline void posix_cputimers_init_work(void) { } * @rcu: RCU head for freeing the timer. 
*/ struct k_itimer { - struct hlist_node list; - struct hlist_node ignored_list; + /* 1st cacheline contains read-mostly fields */ struct hlist_node t_hash; - spinlock_t it_lock; - const struct k_clock *kclock; - clockid_t it_clock; + struct hlist_node list; timer_t it_id; + clockid_t it_clock; + int it_sigev_notify; + enum pid_type it_pid_type; + struct signal_struct *it_signal; + const struct k_clock *kclock; + + /* 2nd cacheline and above contain fields which are modified regularly */ + spinlock_t it_lock; int it_status; bool it_sig_periodic; s64 it_overrun; s64 it_overrun_last; unsigned int it_signal_seq; unsigned int it_sigqueue_seq; - int it_sigev_notify; - enum pid_type it_pid_type; ktime_t it_interval; - struct signal_struct *it_signal; + struct hlist_node ignored_list; union { struct pid *it_pid; struct task_struct *it_process; @@ -210,7 +213,7 @@ struct k_itimer { } alarm; } it; struct rcu_head rcu; -}; +} ____cacheline_aligned_in_smp; void run_posix_cpu_timers(void); void posix_cpu_timers_exit(struct task_struct *task); -- cgit v1.2.3 From ec2d0c04624b3c8a7eb1682e006717fa20cfbe24 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 11 Mar 2025 23:07:44 +0100 Subject: posix-timers: Provide a mechanism to allocate a given timer ID Checkpoint/Restore in Userspace (CRIU) needs to reconstruct posix timers with the same timer ID on restore. It uses sys_timer_create() and relies on the monotonically increasing timer ID provided by this syscall. It creates and deletes timers until the desired ID is reached. This can loop for a long time, when the checkpointed process had a very sparse timer ID range. It has been debated to implement a new syscall to allow the creation of timers with a given timer ID, but that's tedious due to the 32/64bit compat issues of sigevent_t and of dubious value. The restore mechanism of CRIU creates the timers in a state where all threads of the restored process are held on a barrier and cannot issue syscalls. That means the restorer task has exclusive control. This makes it possible to address this issue with a prctl() so that the restorer thread can do: if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON)) goto linear_mode; create_timers_with_explicit_ids(); prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF); This is backwards compatible because the prctl() fails on older kernels and CRIU can fall back to the linear timer ID mechanism. CRIU versions which do not know about the prctl() just work as before. Implement the prctl() and modify timer_create() so that it copies the requested timer ID from userspace by utilizing the existing timer_t pointer, which is used to copy out the allocated timer ID on success. If the prctl() is disabled, which it is by default, timer_create() works as before and does not try to read from the userspace pointer. There is no problem when a broken or rogue user space application enables the prctl(). If the user space pointer does not contain a valid ID, then timer_create() fails. If the data is not initialized, but contains a random valid ID, timer_create() will create that random timer ID or fail if the ID is already given out. As CRIU must use the raw syscall to avoid manipulating the internal state of the restored process, this has no library dependencies and can be adopted by CRIU right away. Recreating two timers with IDs 1000000 and 2000000 takes 1.5 seconds with the create/delete method. With the prctl() it takes 3 microseconds.
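A slightly fuller userspace sketch of the flow above (raw syscalls as CRIU requires; error handling trimmed and the ID value is just an example):

	timer_t id = 2000000;	/* the checkpointed timer's ID */
	struct sigevent sev = {
		.sigev_notify = SIGEV_SIGNAL,
		.sigev_signo  = SIGALRM,
	};

	if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON))
		goto linear_mode;

	/* timer_create() now reads the requested ID through &id */
	if (syscall(SYS_timer_create, CLOCK_MONOTONIC, &sev, &id) == 0)
		assert(id == 2000000);

	prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF);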
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Reviewed-by: Cyrill Gorcunov Tested-by: Cyrill Gorcunov Link: https://lore.kernel.org/all/87jz8vz0en.ffs@tglx --- include/linux/posix-timers.h | 2 ++ include/linux/sched/signal.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 094ef5778c0f..dd48c64b605e 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -114,6 +114,7 @@ bool posixtimer_init_sigqueue(struct sigqueue *q); void posixtimer_send_sigqueue(struct k_itimer *tmr); bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq); void posixtimer_free_timer(struct k_itimer *timer); +long posixtimer_create_prctl(unsigned long ctrl); /* Init task static initializer */ #define INIT_CPU_TIMERBASE(b) { \ @@ -140,6 +141,7 @@ static inline void posixtimer_rearm_itimer(struct task_struct *p) { } static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq) { return false; } static inline void posixtimer_free_timer(struct k_itimer *timer) { } +static inline long posixtimer_create_prctl(unsigned long ctrl) { return -EINVAL; } #endif #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 72649d7fce2a..1ef1edbaaf79 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -136,6 +136,7 @@ struct signal_struct { #ifdef CONFIG_POSIX_TIMERS /* POSIX.1b Interval Timers */ + unsigned int timer_create_restore_ids:1; atomic_t next_posix_timer_id; struct hlist_head posix_timers; struct hlist_head ignored_posix_timers; -- cgit v1.2.3 From 75618ac6e98faee6ed1f17ae64875cc2d7784204 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 13 Mar 2025 09:23:18 +0530 Subject: block: remove unused 'q' parameter in __blk_rq_map_sg() The request_queue parameter is no longer used by blk_rq_map_sg() and __blk_rq_map_sg(). Remove it. Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250313035322.243239-1-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9ebb53f031cd..d99024423355 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1155,14 +1155,13 @@ static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) return max_t(unsigned short, rq->nr_phys_segments, 1); } -int __blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist, struct scatterlist **last_sg); -static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist) +int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, + struct scatterlist **last_sg); +static inline int blk_rq_map_sg(struct request *rq, struct scatterlist *sglist) { struct scatterlist *last_sg = NULL; - return __blk_rq_map_sg(q, rq, sglist, &last_sg); + return __blk_rq_map_sg(rq, sglist, &last_sg); } void blk_dump_rq_flags(struct request *, char *); -- cgit v1.2.3 From 537625233537179cb2e8293b2c0dc9c989363f41 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 9 Mar 2025 09:41:42 +0100 Subject: genirq/msi: Make a few functions static None of these functions are used outside of the MSI core.
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250309084110.204054172@linutronix.de --- include/linux/msi.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index b10093c4d00e..df2dd6355e1c 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -81,7 +81,6 @@ struct device_attribute; struct irq_domain; struct irq_affinity_desc; -void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg); #ifdef CONFIG_GENERIC_MSI_IRQ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg); #else @@ -603,8 +602,6 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid); bool msi_match_device_irq_domain(struct device *dev, unsigned int domid, enum irq_domain_bus_token bus_token); -int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, - unsigned int first, unsigned int last); int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs); @@ -613,8 +610,6 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u const struct irq_affinity_desc *affdesc, union msi_instance_cookie *cookie); -void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, - unsigned int first, unsigned int last); void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid); -- cgit v1.2.3 From ed33479b7beb2b2dc9649a4e7474b47253d554f9 Mon Sep 17 00:00:00 2001 From: Kaustabh Chakraborty Date: Sat, 1 Mar 2025 01:07:13 +0530 Subject: mfd: sec: Add support for S2MPU05 PMIC Add support for Samsung's S2MPU05 PMIC. It's the primary PMIC used by Exynos7870 devices. It houses regulators (21 LDOs and 5 BUCKs) and a RTC clock device. 
Signed-off-by: Kaustabh Chakraborty Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250301-exynos7870-pmic-regulators-v3-2-808d0b47a564@disroot.org Signed-off-by: Lee Jones --- include/linux/mfd/samsung/core.h | 1 + include/linux/mfd/samsung/irq.h | 44 +++++++++ include/linux/mfd/samsung/s2mpu05.h | 183 ++++++++++++++++++++++++++++++++++++ 3 files changed, 228 insertions(+) create mode 100644 include/linux/mfd/samsung/s2mpu05.h (limited to 'include/linux') diff --git a/include/linux/mfd/samsung/core.h b/include/linux/mfd/samsung/core.h index 750274d41fc0..f35314458fd2 100644 --- a/include/linux/mfd/samsung/core.h +++ b/include/linux/mfd/samsung/core.h @@ -44,6 +44,7 @@ enum sec_device_type { S2MPS14X, S2MPS15X, S2MPU02, + S2MPU05, }; /** diff --git a/include/linux/mfd/samsung/irq.h b/include/linux/mfd/samsung/irq.h index 3fd2775eb9bb..978f7af66f74 100644 --- a/include/linux/mfd/samsung/irq.h +++ b/include/linux/mfd/samsung/irq.h @@ -150,6 +150,50 @@ enum s2mpu02_irq { /* Masks for interrupts are the same as in s2mps11 */ #define S2MPS14_IRQ_TSD_MASK (1 << 2) +enum s2mpu05_irq { + S2MPU05_IRQ_PWRONF, + S2MPU05_IRQ_PWRONR, + S2MPU05_IRQ_JIGONBF, + S2MPU05_IRQ_JIGONBR, + S2MPU05_IRQ_ACOKF, + S2MPU05_IRQ_ACOKR, + S2MPU05_IRQ_PWRON1S, + S2MPU05_IRQ_MRB, + + S2MPU05_IRQ_RTC60S, + S2MPU05_IRQ_RTCA1, + S2MPU05_IRQ_RTCA0, + S2MPU05_IRQ_SMPL, + S2MPU05_IRQ_RTC1S, + S2MPU05_IRQ_WTSR, + + S2MPU05_IRQ_INT120C, + S2MPU05_IRQ_INT140C, + S2MPU05_IRQ_TSD, + + S2MPU05_IRQ_NR, +}; + +#define S2MPU05_IRQ_PWRONF_MASK BIT(0) +#define S2MPU05_IRQ_PWRONR_MASK BIT(1) +#define S2MPU05_IRQ_JIGONBF_MASK BIT(2) +#define S2MPU05_IRQ_JIGONBR_MASK BIT(3) +#define S2MPU05_IRQ_ACOKF_MASK BIT(4) +#define S2MPU05_IRQ_ACOKR_MASK BIT(5) +#define S2MPU05_IRQ_PWRON1S_MASK BIT(6) +#define S2MPU05_IRQ_MRB_MASK BIT(7) + +#define S2MPU05_IRQ_RTC60S_MASK BIT(0) +#define S2MPU05_IRQ_RTCA1_MASK BIT(1) +#define S2MPU05_IRQ_RTCA0_MASK BIT(2) +#define S2MPU05_IRQ_SMPL_MASK BIT(3) +#define S2MPU05_IRQ_RTC1S_MASK BIT(4) +#define S2MPU05_IRQ_WTSR_MASK BIT(5) + +#define S2MPU05_IRQ_INT120C_MASK BIT(0) +#define S2MPU05_IRQ_INT140C_MASK BIT(1) +#define S2MPU05_IRQ_TSD_MASK BIT(2) + enum s5m8767_irq { S5M8767_IRQ_PWRR, S5M8767_IRQ_PWRF, diff --git a/include/linux/mfd/samsung/s2mpu05.h b/include/linux/mfd/samsung/s2mpu05.h new file mode 100644 index 000000000000..fcdb6c8adb03 --- /dev/null +++ b/include/linux/mfd/samsung/s2mpu05.h @@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (c) 2015 Samsung Electronics Co., Ltd + * Copyright (c) 2025 Kaustabh Chakraborty + */ + +#ifndef __LINUX_MFD_S2MPU05_H +#define __LINUX_MFD_S2MPU05_H + +/* S2MPU05 registers */ +enum S2MPU05_reg { + S2MPU05_REG_ID, + S2MPU05_REG_INT1, + S2MPU05_REG_INT2, + S2MPU05_REG_INT3, + S2MPU05_REG_INT1M, + S2MPU05_REG_INT2M, + S2MPU05_REG_INT3M, + S2MPU05_REG_ST1, + S2MPU05_REG_ST2, + S2MPU05_REG_PWRONSRC, + S2MPU05_REG_OFFSRC, + S2MPU05_REG_BU_CHG, + S2MPU05_REG_RTC_BUF, + S2MPU05_REG_CTRL1, + S2MPU05_REG_CTRL2, + S2MPU05_REG_ETC_TEST, + S2MPU05_REG_OTP_ADRL, + S2MPU05_REG_OTP_ADRH, + S2MPU05_REG_OTP_DATA, + S2MPU05_REG_MON1SEL, + S2MPU05_REG_MON2SEL, + S2MPU05_REG_CTRL3, + S2MPU05_REG_ETC_OTP, + S2MPU05_REG_UVLO, + S2MPU05_REG_TIME_CTRL1, + S2MPU05_REG_TIME_CTRL2, + S2MPU05_REG_B1CTRL1, + S2MPU05_REG_B1CTRL2, + S2MPU05_REG_B2CTRL1, + S2MPU05_REG_B2CTRL2, + S2MPU05_REG_B2CTRL3, + S2MPU05_REG_B2CTRL4, + S2MPU05_REG_B3CTRL1, + S2MPU05_REG_B3CTRL2, + S2MPU05_REG_B3CTRL3, + S2MPU05_REG_B4CTRL1, + S2MPU05_REG_B4CTRL2, + 
S2MPU05_REG_B5CTRL1, + S2MPU05_REG_B5CTRL2, + S2MPU05_REG_BUCK_RAMP, + S2MPU05_REG_LDO_DVS1, + S2MPU05_REG_LDO_DVS9, + S2MPU05_REG_LDO_DVS10, + S2MPU05_REG_L1CTRL, + S2MPU05_REG_L2CTRL, + S2MPU05_REG_L3CTRL, + S2MPU05_REG_L4CTRL, + S2MPU05_REG_L5CTRL, + S2MPU05_REG_L6CTRL, + S2MPU05_REG_L7CTRL, + S2MPU05_REG_L8CTRL, + S2MPU05_REG_L9CTRL1, + S2MPU05_REG_L9CTRL2, + S2MPU05_REG_L10CTRL, + S2MPU05_REG_L11CTRL1, + S2MPU05_REG_L11CTRL2, + S2MPU05_REG_L12CTRL, + S2MPU05_REG_L13CTRL, + S2MPU05_REG_L14CTRL, + S2MPU05_REG_L15CTRL, + S2MPU05_REG_L16CTRL, + S2MPU05_REG_L17CTRL1, + S2MPU05_REG_L17CTRL2, + S2MPU05_REG_L18CTRL1, + S2MPU05_REG_L18CTRL2, + S2MPU05_REG_L19CTRL, + S2MPU05_REG_L20CTRL, + S2MPU05_REG_L21CTRL, + S2MPU05_REG_L22CTRL, + S2MPU05_REG_L23CTRL, + S2MPU05_REG_L24CTRL, + S2MPU05_REG_L25CTRL, + S2MPU05_REG_L26CTRL, + S2MPU05_REG_L27CTRL, + S2MPU05_REG_L28CTRL, + S2MPU05_REG_L29CTRL, + S2MPU05_REG_L30CTRL, + S2MPU05_REG_L31CTRL, + S2MPU05_REG_L32CTRL, + S2MPU05_REG_L33CTRL, + S2MPU05_REG_L34CTRL, + S2MPU05_REG_L35CTRL, + S2MPU05_REG_LDO_DSCH1, + S2MPU05_REG_LDO_DSCH2, + S2MPU05_REG_LDO_DSCH3, + S2MPU05_REG_LDO_DSCH4, + S2MPU05_REG_LDO_DSCH5, + S2MPU05_REG_LDO_CTRL1, + S2MPU05_REG_LDO_CTRL2, + S2MPU05_REG_TCXO_CTRL, + S2MPU05_REG_SELMIF, +}; + +/* S2MPU05 regulator ids */ +enum S2MPU05_regulators { + S2MPU05_LDO1, + S2MPU05_LDO2, + S2MPU05_LDO3, + S2MPU05_LDO4, + S2MPU05_LDO5, + S2MPU05_LDO6, + S2MPU05_LDO7, + S2MPU05_LDO8, + S2MPU05_LDO9, + S2MPU05_LDO10, + S2MPU05_LDO11, + S2MPU05_LDO12, + S2MPU05_LDO13, + S2MPU05_LDO14, + S2MPU05_LDO15, + S2MPU05_LDO16, + S2MPU05_LDO17, + S2MPU05_LDO18, + S2MPU05_LDO19, + S2MPU05_LDO20, + S2MPU05_LDO21, + S2MPU05_LDO22, + S2MPU05_LDO23, + S2MPU05_LDO24, + S2MPU05_LDO25, + S2MPU05_LDO26, + S2MPU05_LDO27, + S2MPU05_LDO28, + S2MPU05_LDO29, + S2MPU05_LDO30, + S2MPU05_LDO31, + S2MPU05_LDO32, + S2MPU05_LDO33, + S2MPU05_LDO34, + S2MPU05_LDO35, + S2MPU05_BUCK1, + S2MPU05_BUCK2, + S2MPU05_BUCK3, + S2MPU05_BUCK4, + S2MPU05_BUCK5, + + S2MPU05_REGULATOR_MAX, +}; + +#define S2MPU05_SW_ENABLE_MASK 0x03 + +#define S2MPU05_ENABLE_TIME_LDO 128 +#define S2MPU05_ENABLE_TIME_BUCK1 110 +#define S2MPU05_ENABLE_TIME_BUCK2 110 +#define S2MPU05_ENABLE_TIME_BUCK3 110 +#define S2MPU05_ENABLE_TIME_BUCK4 150 +#define S2MPU05_ENABLE_TIME_BUCK5 150 + +#define S2MPU05_LDO_MIN1 800000 +#define S2MPU05_LDO_MIN2 1800000 +#define S2MPU05_LDO_MIN3 400000 +#define S2MPU05_LDO_STEP1 12500 +#define S2MPU05_LDO_STEP2 25000 + +#define S2MPU05_BUCK_MIN1 400000 +#define S2MPU05_BUCK_MIN2 600000 +#define S2MPU05_BUCK_STEP1 6250 +#define S2MPU05_BUCK_STEP2 12500 + +#define S2MPU05_RAMP_DELAY 12000 /* uV/uS */ + +#define S2MPU05_ENABLE_SHIFT 6 +#define S2MPU05_ENABLE_MASK (0x03 << S2MPU05_ENABLE_SHIFT) + +#define S2MPU05_LDO_VSEL_MASK 0x3F +#define S2MPU05_BUCK_VSEL_MASK 0xFF +#define S2MPU05_LDO_N_VOLTAGES (S2MPU05_LDO_VSEL_MASK + 1) +#define S2MPU05_BUCK_N_VOLTAGES (S2MPU05_BUCK_VSEL_MASK + 1) + +#define S2MPU05_PMIC_EN_SHIFT 6 + +#endif /* __LINUX_MFD_S2MPU05_H */ -- cgit v1.2.3 From a3996d11f3ab743e6cc4e3529ce9459c2cd27139 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 13 Mar 2025 17:21:50 +0530 Subject: block: protect debugfs attrs using elevator_lock instead of sysfs_lock Currently, the block debugfs attributes (tags, tags_bitmap, sched_tags, and sched_tags_bitmap) are protected using q->sysfs_lock. 
However, these attributes are updated in multiple scenarios:

 - During the driver probe method
 - During an elevator switch/update
 - During an nr_hw_queues update
 - When writing to the sysfs attribute nr_requests

All these update paths (except the driver probe method, which doesn't require any protection) are already protected using q->elevator_lock. To ensure consistency and proper synchronization, replace q->sysfs_lock with q->elevator_lock for protecting these debugfs attributes. Signed-off-by: Nilay Shroff Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250313115235.3707600-2-nilay@linux.ibm.com [axboe: some commit message rewording/fixes] Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dcf8fce15e23..8d072042790e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -566,9 +566,9 @@ struct request_queue { * nr_requests and wbt latency, this lock also protects the sysfs attrs * nr_requests and wbt_lat_usec. Additionally the nr_hw_queues update * may modify hctx tags, reserved-tags and cpumask, so this lock also - * helps protect the hctx attrs. To ensure proper locking order during - * an elevator or nr_hw_queue update, first freeze the queue, then - * acquire ->elevator_lock. + * helps protect the hctx sysfs/debugfs attrs. To ensure proper locking + * order during an elevator or nr_hw_queue update, first freeze the + * queue, then acquire ->elevator_lock. */ struct mutex elevator_lock; -- cgit v1.2.3 From e4b3cbd840e565484d0ad8d260d27c057466ed17 Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Tue, 11 Mar 2025 18:18:57 +0100 Subject: firmware: thead: Add AON firmware protocol driver The T-Head TH1520 SoC uses an E902 co-processor running Always-On (AON) firmware to manage power, clock, and other system resources [1]. This patch introduces a driver implementing the AON firmware protocol, allowing the Linux kernel to communicate with the firmware via mailbox channels. Through an RPC-based interface, the kernel can initiate power state transitions, update resource configurations, and perform other AON-related tasks. [1] Link: https://openbeagle.org/beaglev-ahead/beaglev-ahead/-/blob/main/docs/TH1520%20System%20User%20Manual.pdf Signed-off-by: Michal Wilczynski Acked-by: Drew Fustini Link: https://lore.kernel.org/r/20250311171900.1549916-3-m.wilczynski@samsung.com Signed-off-by: Ulf Hansson --- include/linux/firmware/thead/thead,th1520-aon.h | 200 ++++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 include/linux/firmware/thead/thead,th1520-aon.h (limited to 'include/linux') diff --git a/include/linux/firmware/thead/thead,th1520-aon.h b/include/linux/firmware/thead/thead,th1520-aon.h new file mode 100644 index 000000000000..dae132b66873 --- /dev/null +++ b/include/linux/firmware/thead/thead,th1520-aon.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021 Alibaba Group Holding Limited.
+ */ + +#ifndef _THEAD_AON_H +#define _THEAD_AON_H + +#include +#include + +#define AON_RPC_MSG_MAGIC (0xef) +#define TH1520_AON_RPC_VERSION 2 +#define TH1520_AON_RPC_MSG_NUM 7 + +struct th1520_aon_chan; + +enum th1520_aon_rpc_svc { + TH1520_AON_RPC_SVC_UNKNOWN = 0, + TH1520_AON_RPC_SVC_PM = 1, + TH1520_AON_RPC_SVC_MISC = 2, + TH1520_AON_RPC_SVC_AVFS = 3, + TH1520_AON_RPC_SVC_SYS = 4, + TH1520_AON_RPC_SVC_WDG = 5, + TH1520_AON_RPC_SVC_LPM = 6, + TH1520_AON_RPC_SVC_MAX = 0x3F, +}; + +enum th1520_aon_misc_func { + TH1520_AON_MISC_FUNC_UNKNOWN = 0, + TH1520_AON_MISC_FUNC_SET_CONTROL = 1, + TH1520_AON_MISC_FUNC_GET_CONTROL = 2, + TH1520_AON_MISC_FUNC_REGDUMP_CFG = 3, +}; + +enum th1520_aon_wdg_func { + TH1520_AON_WDG_FUNC_UNKNOWN = 0, + TH1520_AON_WDG_FUNC_START = 1, + TH1520_AON_WDG_FUNC_STOP = 2, + TH1520_AON_WDG_FUNC_PING = 3, + TH1520_AON_WDG_FUNC_TIMEOUTSET = 4, + TH1520_AON_WDG_FUNC_RESTART = 5, + TH1520_AON_WDG_FUNC_GET_STATE = 6, + TH1520_AON_WDG_FUNC_POWER_OFF = 7, + TH1520_AON_WDG_FUNC_AON_WDT_ON = 8, + TH1520_AON_WDG_FUNC_AON_WDT_OFF = 9, +}; + +enum th1520_aon_sys_func { + TH1520_AON_SYS_FUNC_UNKNOWN = 0, + TH1520_AON_SYS_FUNC_AON_RESERVE_MEM = 1, +}; + +enum th1520_aon_lpm_func { + TH1520_AON_LPM_FUNC_UNKNOWN = 0, + TH1520_AON_LPM_FUNC_REQUIRE_STR = 1, + TH1520_AON_LPM_FUNC_RESUME_STR = 2, + TH1520_AON_LPM_FUNC_REQUIRE_STD = 3, + TH1520_AON_LPM_FUNC_CPUHP = 4, + TH1520_AON_LPM_FUNC_REGDUMP_CFG = 5, +}; + +enum th1520_aon_pm_func { + TH1520_AON_PM_FUNC_UNKNOWN = 0, + TH1520_AON_PM_FUNC_SET_RESOURCE_REGULATOR = 1, + TH1520_AON_PM_FUNC_GET_RESOURCE_REGULATOR = 2, + TH1520_AON_PM_FUNC_SET_RESOURCE_POWER_MODE = 3, + TH1520_AON_PM_FUNC_PWR_SET = 4, + TH1520_AON_PM_FUNC_PWR_GET = 5, + TH1520_AON_PM_FUNC_CHECK_FAULT = 6, + TH1520_AON_PM_FUNC_GET_TEMPERATURE = 7, +}; + +struct th1520_aon_rpc_msg_hdr { + u8 ver; /* version of msg hdr */ + u8 size; /* msg size ,uinit in bytes,the size includes rpc msg header self */ + u8 svc; /* rpc main service id */ + u8 func; /* rpc sub func id of specific service, sent by caller */ +} __packed __aligned(1); + +struct th1520_aon_rpc_ack_common { + struct th1520_aon_rpc_msg_hdr hdr; + u8 err_code; +} __packed __aligned(1); + +#define RPC_SVC_MSG_TYPE_DATA 0 +#define RPC_SVC_MSG_TYPE_ACK 1 +#define RPC_SVC_MSG_NEED_ACK 0 +#define RPC_SVC_MSG_NO_NEED_ACK 1 + +#define RPC_GET_VER(MESG) ((MESG)->ver) +#define RPC_SET_VER(MESG, VER) ((MESG)->ver = (VER)) +#define RPC_GET_SVC_ID(MESG) ((MESG)->svc & 0x3F) +#define RPC_SET_SVC_ID(MESG, ID) ((MESG)->svc |= 0x3F & (ID)) +#define RPC_GET_SVC_FLAG_MSG_TYPE(MESG) (((MESG)->svc & 0x80) >> 7) +#define RPC_SET_SVC_FLAG_MSG_TYPE(MESG, TYPE) ((MESG)->svc |= (TYPE) << 7) +#define RPC_GET_SVC_FLAG_ACK_TYPE(MESG) (((MESG)->svc & 0x40) >> 6) +#define RPC_SET_SVC_FLAG_ACK_TYPE(MESG, ACK) ((MESG)->svc |= (ACK) << 6) + +#define RPC_SET_BE64(MESG, OFFSET, SET_DATA) \ + do { \ + u8 *data = (u8 *)(MESG); \ + u64 _offset = (OFFSET); \ + u64 _set_data = (SET_DATA); \ + data[_offset + 7] = _set_data & 0xFF; \ + data[_offset + 6] = (_set_data & 0xFF00) >> 8; \ + data[_offset + 5] = (_set_data & 0xFF0000) >> 16; \ + data[_offset + 4] = (_set_data & 0xFF000000) >> 24; \ + data[_offset + 3] = (_set_data & 0xFF00000000) >> 32; \ + data[_offset + 2] = (_set_data & 0xFF0000000000) >> 40; \ + data[_offset + 1] = (_set_data & 0xFF000000000000) >> 48; \ + data[_offset + 0] = (_set_data & 0xFF00000000000000) >> 56; \ + } while (0) + +#define RPC_SET_BE32(MESG, OFFSET, SET_DATA) \ + do { \ + u8 *data = (u8 *)(MESG); \ + u64 _offset = (OFFSET); \ + 
u64 _set_data = (SET_DATA); \ + data[_offset + 3] = (_set_data) & 0xFF; \ + data[_offset + 2] = (_set_data & 0xFF00) >> 8; \ + data[_offset + 1] = (_set_data & 0xFF0000) >> 16; \ + data[_offset + 0] = (_set_data & 0xFF000000) >> 24; \ + } while (0) + +#define RPC_SET_BE16(MESG, OFFSET, SET_DATA) \ + do { \ + u8 *data = (u8 *)(MESG); \ + u64 _offset = (OFFSET); \ + u64 _set_data = (SET_DATA); \ + data[_offset + 1] = (_set_data) & 0xFF; \ + data[_offset + 0] = (_set_data & 0xFF00) >> 8; \ + } while (0) + +#define RPC_SET_U8(MESG, OFFSET, SET_DATA) \ + do { \ + u8 *data = (u8 *)(MESG); \ + data[OFFSET] = (SET_DATA) & 0xFF; \ + } while (0) + +#define RPC_GET_BE64(MESG, OFFSET, PTR) \ + do { \ + u8 *data = (u8 *)(MESG); \ + u64 _offset = (OFFSET); \ + *(u32 *)(PTR) = \ + (data[_offset + 7] | data[_offset + 6] << 8 | \ + data[_offset + 5] << 16 | data[_offset + 4] << 24 | \ + data[_offset + 3] << 32 | data[_offset + 2] << 40 | \ + data[_offset + 1] << 48 | data[_offset + 0] << 56); \ + } while (0) + +#define RPC_GET_BE32(MESG, OFFSET, PTR) \ + do { \ + u8 *data = (u8 *)(MESG); \ + u64 _offset = (OFFSET); \ + *(u32 *)(PTR) = \ + (data[_offset + 3] | data[_offset + 2] << 8 | \ + data[_offset + 1] << 16 | data[_offset + 0] << 24); \ + } while (0) + +#define RPC_GET_BE16(MESG, OFFSET, PTR) \ + do { \ + u8 *data = (u8 *)(MESG); \ + u64 _offset = (OFFSET); \ + *(u16 *)(PTR) = (data[_offset + 1] | data[_offset + 0] << 8); \ + } while (0) + +#define RPC_GET_U8(MESG, OFFSET, PTR) \ + do { \ + u8 *data = (u8 *)(MESG); \ + *(u8 *)(PTR) = (data[OFFSET]); \ + } while (0) + +/* + * Defines for SC PM Power Mode + */ +#define TH1520_AON_PM_PW_MODE_OFF 0 /* Power off */ +#define TH1520_AON_PM_PW_MODE_STBY 1 /* Power in standby */ +#define TH1520_AON_PM_PW_MODE_LP 2 /* Power in low-power */ +#define TH1520_AON_PM_PW_MODE_ON 3 /* Power on */ + +/* + * Defines for AON power islands + */ +#define TH1520_AON_AUDIO_PD 0 +#define TH1520_AON_VDEC_PD 1 +#define TH1520_AON_NPU_PD 2 +#define TH1520_AON_VENC_PD 3 +#define TH1520_AON_GPU_PD 4 +#define TH1520_AON_DSP0_PD 5 +#define TH1520_AON_DSP1_PD 6 + +struct th1520_aon_chan *th1520_aon_init(struct device *dev); +void th1520_aon_deinit(struct th1520_aon_chan *aon_chan); + +int th1520_aon_call_rpc(struct th1520_aon_chan *aon_chan, void *msg); +int th1520_aon_power_update(struct th1520_aon_chan *aon_chan, u16 rsrc, + bool power_on); + +#endif /* _THEAD_AON_H */ -- cgit v1.2.3 From 6e969ef3d7cff494118205c85a21e05b046ac6c6 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 22 Jan 2025 19:05:30 +0800 Subject: jbd2: drop JBD2_ABORT_ON_SYNCDATA_ERR Since ext4's data_err=abort mode doesn't depend on JBD2_ABORT_ON_SYNCDATA_ERR anymore, and nobody else uses it, we can drop it and only warn in jbd2 as it used to be long ago. 
Suggested-by: Jan Kara Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20250122110533.4116662-7-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 82ef232935c0..cdb648053184 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1382,9 +1382,6 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) #define JBD2_FLUSHED 0x008 /* The journal superblock has been flushed */ #define JBD2_LOADED 0x010 /* The journal superblock has been loaded */ #define JBD2_BARRIER 0x020 /* Use IDE barriers */ -#define JBD2_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file - * data write error in ordered - * mode */ #define JBD2_CYCLE_RECORD 0x080 /* Journal cycled record log on * clean and empty filesystem * logging area */ -- cgit v1.2.3 From 91ce208d7ab7ab7703537a054a0c31cb53bbf302 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 13 Mar 2025 16:03:13 +0200 Subject: spi: Use inclusive language Replace "master" by "[host] controller" in the SPI core code and comments. This is similar to the earlier "slave" -> "target [device]" changes. Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20250313140340.380359-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 8d5c7da39c85..0ba5e49bace4 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -35,7 +35,7 @@ struct spi_offload; struct spi_offload_config; /* - * INTERFACES between SPI master-side drivers and SPI slave protocol handlers, + * INTERFACES between SPI controller-side drivers and SPI target protocol handlers, * and SPI infrastructure. */ extern const struct bus_type spi_bus_type; @@ -130,7 +130,7 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, struct spi_transfer *xfer); /** - * struct spi_device - Controller side proxy for an SPI slave device + * struct spi_device - Controller side proxy for an SPI target device * @dev: Driver model representation of the device. * @controller: SPI controller used with the device. * @max_speed_hz: Maximum clock rate to be used with this chip @@ -174,7 +174,7 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * @pcpu_statistics: statistics for the spi_device * @cs_index_mask: Bit mask of the active chipselect(s) in the chipselect array * - * A @spi_device is used to interchange data between an SPI slave + * A @spi_device is used to interchange data between an SPI target device * (usually a discrete chip) and CPU memory. * * In @dev, the platform_data is used to hold information about this @@ -388,15 +388,15 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch spi_unregister_driver) /** - * struct spi_controller - interface to SPI master or slave controller + * struct spi_controller - interface to SPI host or target controller * @dev: device interface to this driver * @list: link with the global spi_controller list * @bus_num: board-specific (and often SOC-specific) identifier for a * given SPI controller. * @num_chipselect: chipselects are used to distinguish individual - * SPI slaves, and are numbered from zero to num_chipselects.
- * each slave has a chipselect signal, but it's common that not - * every chipselect is connected to a slave. + * SPI targets, and are numbered from zero to num_chipselects. + * each target has a chipselect signal, but it's common that not + * every chipselect is connected to a target. * @dma_alignment: SPI controller constraint on DMA buffers alignment. * @mode_bits: flags understood by this controller driver * @buswidth_override_bits: flags to override for this controller driver @@ -425,9 +425,9 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * must fail if an unrecognized or unsupported mode is requested. * It's always safe to call this unless transfers are pending on * the device whose settings are being modified. - * @set_cs_timing: optional hook for SPI devices to request SPI master + * @set_cs_timing: optional hook for SPI devices to request SPI * controller for configuring specific CS setup time, hold time and inactive - * delay interms of clock counts + * delay in terms of clock counts * @transfer: adds a message to the controller's transfer queue. * @cleanup: frees controller-specific state * @can_dma: determine whether this controller supports DMA @@ -547,7 +547,7 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * * The driver for an SPI controller manages access to those devices through * a queue of spi_message transactions, copying data between CPU memory and - * an SPI slave device. For each such message it queues, it calls the + * an SPI target device. For each such message it queues, it calls the * message's completion function when the transaction completes. */ struct spi_controller { @@ -597,7 +597,7 @@ struct spi_controller { #define SPI_CONTROLLER_NO_TX BIT(2) /* Can't do buffer write */ #define SPI_CONTROLLER_MUST_RX BIT(3) /* Requires rx */ #define SPI_CONTROLLER_MUST_TX BIT(4) /* Requires tx */ -#define SPI_CONTROLLER_GPIO_SS BIT(5) /* GPIO CS must select slave */ +#define SPI_CONTROLLER_GPIO_SS BIT(5) /* GPIO CS must select target device */ #define SPI_CONTROLLER_SUSPENDED BIT(6) /* Currently suspended */ /* * The spi-controller has multi chip select capability and can @@ -664,7 +664,7 @@ struct spi_controller { * + To a given spi_device, message queueing is pure FIFO * * + The controller's main job is to process its message queue, - * selecting a chip (for masters), then transferring data + * selecting a chip (for controllers), then transferring data * + If there are multiple spi_device children, the i/o queue * arbitration algorithm is unspecified (round robin, FIFO, * priority, reservations, preemption, etc) @@ -832,7 +832,7 @@ void spi_take_timestamp_post(struct spi_controller *ctlr, /* The SPI driver core manages memory for the spi_controller classdev */ extern struct spi_controller *__spi_alloc_controller(struct device *host, - unsigned int size, bool slave); + unsigned int size, bool target); static inline struct spi_controller *spi_alloc_host(struct device *dev, unsigned int size) @@ -851,7 +851,7 @@ static inline struct spi_controller *spi_alloc_target(struct device *dev, struct spi_controller *__devm_spi_alloc_controller(struct device *dev, unsigned int size, - bool slave); + bool target); static inline struct spi_controller *devm_spi_alloc_host(struct device *dev, unsigned int size) @@ -989,12 +989,12 @@ struct spi_res { * purposefully (instead of setting to spi_transfer->len - 1) to denote * that a transfer-level snapshot taken from within the driver may still * be of higher quality. 
- * @ptp_sts: Pointer to a memory location held by the SPI slave device where a + * @ptp_sts: Pointer to a memory location held by the SPI target device where a * PTP system timestamp structure may lie. If drivers use PIO or their * hardware has some sort of assist for retrieving exact transfer timing, * they can (and should) assert @ptp_sts_supported and populate this * structure using the ptp_read_system_*ts helper functions. - * The timestamp must represent the time at which the SPI slave device has + * The timestamp must represent the time at which the SPI target device has * processed the word, i.e. the "pre" timestamp should be taken before * transmitting the "pre" word, and the "post" timestamp after receiving * transmit confirmation from the controller for the "post" word. @@ -1622,7 +1622,7 @@ struct spi_board_info { * bus_num is board specific and matches the bus_num of some * spi_controller that will probably be registered later. * - * chip_select reflects how this chip is wired to that master; + * chip_select reflects how this chip is wired to that controller; * it's less than num_chipselect. */ u16 bus_num; -- cgit v1.2.3 From ec22493849247d60d595c93573ac3b01534b1965 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 23 Jan 2025 23:50:09 +0800 Subject: jbd2: remove unused h_jdata flag of handle Flag h_jdata is not used, just remove it. Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20250123155014.2097920-2-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index cdb648053184..0c4522154c0a 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -459,7 +459,6 @@ struct jbd2_revoke_table_s; * @h_ref: Reference count on this handle. * @h_err: Field for caller's use to track errors through large fs operations. * @h_sync: Flag for sync-on-close. - * @h_jdata: Flag to force data journaling. * @h_reserved: Flag for handle for reserved credits. * @h_aborted: Flag indicating fatal error on handle. * @h_type: For handle statistics. @@ -491,7 +490,6 @@ struct jbd2_journal_handle /* Flags [no locking] */ unsigned int h_sync: 1; - unsigned int h_jdata: 1; unsigned int h_reserved: 1; unsigned int h_aborted: 1; unsigned int h_type: 8; -- cgit v1.2.3 From 9e6d3f9c8a85ed0db0ed1586049321e6b2ac5138 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 23 Jan 2025 23:50:10 +0800 Subject: jbd2: remove unused return value of jbd2_journal_cancel_revoke Remove unused return value of jbd2_journal_cancel_revoke. 
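With the void return, callers now invoke it purely for its side effect, e.g. (illustrative):

	jbd2_journal_cancel_revoke(handle, jh);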
Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20250123155014.2097920-3-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 0c4522154c0a..8018d4aecf1b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1621,7 +1621,7 @@ void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table); extern void jbd2_journal_destroy_revoke(journal_t *); extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); -extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); +extern void jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); extern void jbd2_journal_write_revoke_records(transaction_t *transaction, struct list_head *log_bufs); -- cgit v1.2.3 From 955fbe0ef19df4197595a98d0906c94025c4beef Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 12 Mar 2025 08:38:50 +0100 Subject: Revert "fsnotify: generate pre-content permission event on page fault" This reverts commit 8392bc2ff8c8bf7c4c5e6dfa71ccd893a3c046f6. In the use case of a buffered write whose input buffer is an mmapped file on a filesystem with a pre-content mark, the prefaulting of the buffer can happen under the filesystem freeze protection (obtained in vfs_write()), which breaks the assumptions of the pre-content hook and introduces a potential deadlock of an HSM handler in userspace with filesystem freezing. Now that we have pre-content hooks at file mmap() time, disable the pre-content event hooks on page fault to avoid the potential deadlock. Reported-by: syzbot+7229071b47908b19d5b7@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-fsdevel/7ehxrhbvehlrjwvrduoxsao5k3x4aw275patsb3krkwuq573yv@o2hskrfawbnc/ Fixes: 8392bc2ff8c8 ("fsnotify: generate pre-content permission event on page fault") Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/20250312073852.2123409-5-amir73il@gmail.com --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b1068ddcbb7..8483e09aeb2c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3420,7 +3420,6 @@ extern vm_fault_t filemap_fault(struct vm_fault *vmf); extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); -extern vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf); extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ -- cgit v1.2.3 From 08549ff3e53b9c7bc55724d660ca733041a8bd5f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Mar 2025 14:03:38 +0100 Subject: cleanup: Provide retain_ptr() In cases where an allocation is consumed by another function, the allocation needs to be retained on success or freed on failure. The code pattern is usually:

	struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);
	struct bar *b;

	...	// Initialize f

	if (ret)
		goto free;
	...
	b = bar_create(f);
	if (!b) {
		ret = -ENOMEM;
		goto free;
	}
	...
	return 0;
free:
	kfree(f);
	return ret;

This prevents using __free(kfree) on @f because there is no canonical way to tell the cleanup code that the allocation should not be freed. Abusing no_free_ptr() by force ignoring the return value is not really a sensible option either.
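To make the problem concrete (editorial sketch, not part of the original changelog): with __free(kfree) attached, scope exit always frees @f, so a successful handover to bar_create() would leave the consumer holding freed memory:

	struct foo *f __free(kfree) = kzalloc(sizeof(*f), GFP_KERNEL);
	struct bar *b;

	b = bar_create(f);
	if (!b)
		return -ENOMEM;	/* scope exit frees @f - correct */
	return 0;		/* scope exit frees @f too - consumer left with a dangling pointer */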
Provide an explicit macro retain_ptr(), which NULLs the cleanup pointer. That makes it easy to analyze and reason about. Signed-off-by: Thomas Gleixner Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/all/20250313130321.442025758@linutronix.de --- include/linux/cleanup.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index ec00e3f7af2b..6537f8dfe1bb 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -216,6 +216,23 @@ const volatile void * __must_check_fn(const volatile void *val) #define return_ptr(p) return no_free_ptr(p) +/* + * Only for situations where an allocation is handed in to another function + * and consumed by that function on success. + * + * struct foo *f __free(kfree) = kzalloc(sizeof(*f), GFP_KERNEL); + * + * setup(f); + * if (some_condition) + * return -EINVAL; + * .... + * ret = bar(f); + * if (!ret) + * retain_ptr(f); + * return ret; + */ +#define retain_ptr(p) \ + __get_and_null(p, NULL) /* * DEFINE_CLASS(name, type, exit, init, init_args...): -- cgit v1.2.3 From 5c99e0226eccb11738a30e27454157e63dcf2b52 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Mar 2025 14:03:39 +0100 Subject: genirq/msi: Use lock guards for MSI descriptor locking Provide a lock guard for MSI descriptor locking and update the core code accordingly. No functional change intended. Signed-off-by: Thomas Gleixner Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/all/20250313130321.506045185@linutronix.de --- include/linux/irqdomain.h | 2 ++ include/linux/msi.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index e432b6a12a32..d258aa2fc022 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -281,6 +281,8 @@ static inline struct fwnode_handle *irq_domain_alloc_fwnode(phys_addr_t *pa) void irq_domain_free_fwnode(struct fwnode_handle *fwnode); +DEFINE_FREE(irq_domain_free_fwnode, struct fwnode_handle *, if (_T) irq_domain_free_fwnode(_T)) + struct irq_domain_chip_generic_info; /** diff --git a/include/linux/msi.h b/include/linux/msi.h index df2dd6355e1c..3246ca633a5c 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -227,6 +227,9 @@ int msi_setup_device_data(struct device *dev); void msi_lock_descs(struct device *dev); void msi_unlock_descs(struct device *dev); +DEFINE_LOCK_GUARD_1(msi_descs_lock, struct device, msi_lock_descs(_T->lock), + msi_unlock_descs(_T->lock)); + struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid, enum msi_desc_filter filter); -- cgit v1.2.3 From 8327df40592103bcc693a99c8cb478c35c7ec7e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Mar 2025 14:03:53 +0100 Subject: genirq/msi: Rename msi_[un]lock_descs() Now that all abuse is gone and the legit users are converted to guard(msi_descs_lock), rename the lock functions and document them as internal. No functional change. 
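For context, a converted call site now holds the descriptor lock for the remainder of the enclosing scope (sketch only; msi_teardown() is a made-up caller name, the guard class and the freeing helper are from this series):

	static void msi_teardown(struct device *dev)
	{
		guard(msi_descs_lock)(dev);
		/* __msi_lock_descs() is held until this scope is left */
		msi_domain_free_irqs_all_locked(dev, MSI_DEFAULT_DOMAIN);
	}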
Signed-off-by: Thomas Gleixner Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/all/20250313130322.027190131@linutronix.de --- include/linux/msi.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 3246ca633a5c..0c39cafaf604 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -224,11 +224,11 @@ struct msi_dev_domain { int msi_setup_device_data(struct device *dev); -void msi_lock_descs(struct device *dev); -void msi_unlock_descs(struct device *dev); +void __msi_lock_descs(struct device *dev); +void __msi_unlock_descs(struct device *dev); -DEFINE_LOCK_GUARD_1(msi_descs_lock, struct device, msi_lock_descs(_T->lock), - msi_unlock_descs(_T->lock)); +DEFINE_LOCK_GUARD_1(msi_descs_lock, struct device, __msi_lock_descs(_T->lock), + __msi_unlock_descs(_T->lock)); struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid, enum msi_desc_filter filter); -- cgit v1.2.3 From a31b4dcf188cc07980376519ff9d8501463c9069 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 4 Mar 2025 14:34:23 +0100 Subject: clk: davinci: remove support for da830 This SoC has some leftover code all over the kernel but no boards are supported anymore. Remove support for da830 from the davinci clock driver. Along with it, remove the ifdefs around the data structures, as the da850 remains the only davinci SoC supported and the only user of this driver. Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20250304133423.100884-1-brgl@bgdev.pl Acked-by: David Lechner Signed-off-by: Stephen Boyd --- include/linux/clk/davinci.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk/davinci.h b/include/linux/clk/davinci.h index e1d37451e03f..787a81116b00 100644 --- a/include/linux/clk/davinci.h +++ b/include/linux/clk/davinci.h @@ -12,12 +12,6 @@ #include /* function for registering clocks in early boot */ - -#ifdef CONFIG_ARCH_DAVINCI_DA830 -int da830_pll_init(struct device *dev, void __iomem *base, struct regmap *cfgchip); -#endif -#ifdef CONFIG_ARCH_DAVINCI_DA850 int da850_pll0_init(struct device *dev, void __iomem *base, struct regmap *cfgchip); -#endif #endif /* __LINUX_CLK_DAVINCI_PLL_H___ */ -- cgit v1.2.3 From 65d1f5507ed2c78c64fce40e44e5574a9419eb09 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Sat, 8 Mar 2025 12:09:33 -0800 Subject: zstd: Import upstream v1.5.7 In addition to keeping the kernel's copy of zstd up to date, this update was requested by Intel to expose upstream's APIs that allow QAT to accelerate the LZ match finding stage of Zstd. This patch is imported from the upstream tag v1.5.7-kernel [0], which is signed with upstream's signing key EF8FE99528B52FFD [1]. It was imported from upstream using this command:

	export ZSTD=/path/to/repo/zstd/
	export LINUX=/path/to/repo/linux/
	cd "$ZSTD/contrib/linux-kernel"
	git checkout v1.5.7-kernel
	make import LINUX="$LINUX"

This patch has been tested on x86-64, and has been boot tested with a zstd compressed kernel & initramfs on i386 and aarch64. I benchmarked the patch on x86-64 with gcc-14.2.1 on an Intel i9-9900K by measuring the performance of compressed filesystem reads and writes.

Component, Level, Size delta, C. time delta, D. time delta
Btrfs    ,     1,     +0.00%,         -6.1%,         +1.4%
Btrfs    ,     3,     +0.00%,         -9.8%,         +3.0%
Btrfs    ,     5,     +0.00%,         +1.7%,         +1.4%
Btrfs    ,     7,     +0.00%,         -1.9%,         +2.7%
Btrfs    ,     9,     +0.00%,         -3.4%,         +3.7%
Btrfs    ,    15,     +0.00%,         -0.3%,         +3.6%
SquashFS ,     1,     +0.00%,           N/A,         +1.9%

The major changes that impact the kernel use cases for each version are:

v1.5.7: https://github.com/facebook/zstd/releases/tag/v1.5.7
* Add zstd_compress_sequences_and_literals() for use by Intel's QAT driver to implement Zstd compression acceleration in the kernel.
* Fix an underflow bug in 32-bit builds that can cause data corruption when processing more than 4GB of data with a single `ZSTD_CCtx` object, when an input crosses the 4GB boundary. I don't believe this impacts any current kernel use cases, because the `ZSTD_CCtx` is typically reconstructed between compressions.
* Levels 1-4 see 5-10% compression speed improvements for inputs smaller than 128KB.

v1.5.6: https://github.com/facebook/zstd/releases/tag/v1.5.6
* Improved compression ratio for the highest compression levels. I don't expect these to see much use, however, due to their slow speeds.

v1.5.5: https://github.com/facebook/zstd/releases/tag/v1.5.5
* Fix a rare corruption bug that can trigger on levels 13 and above.
* Improve compression speed of levels 5-11 on incompressible data.

v1.5.4: https://github.com/facebook/zstd/releases/tag/v1.5.4
* Improve compression speed of levels 5-11 on ARM.
* Improve dictionary compression speed.

Signed-off-by: Nick Terrell --- include/linux/zstd.h | 87 +++- include/linux/zstd_errors.h | 30 +- include/linux/zstd_lib.h | 1123 +++++++++++++++++++++++++++++++++---------- 3 files changed, 968 insertions(+), 272 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zstd.h b/include/linux/zstd.h index b2c7cf310c8f..2f2a3c8b8a33 100644 --- a/include/linux/zstd.h +++ b/include/linux/zstd.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -160,7 +160,6 @@ typedef ZSTD_parameters zstd_parameters; zstd_parameters zstd_get_params(int level, unsigned long long estimated_src_size); - /** * zstd_get_cparams() - returns zstd_compression_parameters for selected level * @level: The compression level @@ -173,9 +172,20 @@ zstd_parameters zstd_get_params(int level, zstd_compression_parameters zstd_get_cparams(int level, unsigned long long estimated_src_size, size_t dict_size); -/* ====== Single-pass Compression ====== */ - typedef ZSTD_CCtx zstd_cctx; +typedef ZSTD_cParameter zstd_cparameter; + +/** + * zstd_cctx_set_param() - sets a compression parameter + * @cctx: The context. Must have been initialized with zstd_init_cctx(). + * @param: The parameter to set. + * @value: The value to set the parameter to. + * + * Return: Zero or an error, which can be checked using zstd_is_error(). + */ +size_t zstd_cctx_set_param(zstd_cctx *cctx, zstd_cparameter param, int value); + +/* ====== Single-pass Compression ====== */ /** * zstd_cctx_workspace_bound() - max memory needed to initialize a zstd_cctx @@ -190,6 +200,20 @@ typedef ZSTD_CCtx zstd_cctx; */ size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *parameters); +/** + * zstd_cctx_workspace_bound_with_ext_seq_prod() - max memory needed to + * initialize a zstd_cctx when using the block-level external sequence + * producer API.
+ * @parameters: The compression parameters to be used. + * + * If multiple compression parameters might be used, the caller must call + * this function for each set of parameters and use the maximum size. + * + * Return: A lower bound on the size of the workspace that is passed to + * zstd_init_cctx(). + */ +size_t zstd_cctx_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *parameters); + /** * zstd_init_cctx() - initialize a zstd compression context * @workspace: The workspace to emplace the context into. It must outlive @@ -424,6 +448,16 @@ typedef ZSTD_CStream zstd_cstream; */ size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams); +/** + * zstd_cstream_workspace_bound_with_ext_seq_prod() - memory needed to initialize + * a zstd_cstream when using the block-level external sequence producer API. + * @cparams: The compression parameters to be used for compression. + * + * Return: A lower bound on the size of the workspace that is passed to + * zstd_init_cstream(). + */ +size_t zstd_cstream_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *cparams); + /** * zstd_init_cstream() - initialize a zstd streaming compression context * @parameters The zstd parameters to use for compression. @@ -583,6 +617,18 @@ size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output, */ size_t zstd_find_frame_compressed_size(const void *src, size_t src_size); +/** + * zstd_register_sequence_producer() - exposes the zstd library function + * ZSTD_registerSequenceProducer(). This is used for the block-level external + * sequence producer API. See upstream zstd.h for detailed documentation. + */ +typedef ZSTD_sequenceProducer_F zstd_sequence_producer_f; +void zstd_register_sequence_producer( + zstd_cctx *cctx, + void* sequence_producer_state, + zstd_sequence_producer_f sequence_producer +); + /** * struct zstd_frame_params - zstd frame parameters stored in the frame header * @frameContentSize: The frame content size, or ZSTD_CONTENTSIZE_UNKNOWN if not @@ -596,7 +642,7 @@ size_t zstd_find_frame_compressed_size(const void *src, size_t src_size); * * See zstd_lib.h. */ -typedef ZSTD_frameHeader zstd_frame_header; +typedef ZSTD_FrameHeader zstd_frame_header; /** * zstd_get_frame_header() - extracts parameters from a zstd or skippable frame @@ -611,4 +657,35 @@ typedef ZSTD_frameHeader zstd_frame_header; size_t zstd_get_frame_header(zstd_frame_header *params, const void *src, size_t src_size); +/** + * struct zstd_sequence - a sequence of literals or a match + * + * @offset: The offset of the match + * @litLength: The literal length of the sequence + * @matchLength: The match length of the sequence + * @rep: Represents which repeat offset is used + */ +typedef ZSTD_Sequence zstd_sequence; + +/** + * zstd_compress_sequences_and_literals() - compress an array of zstd_sequence and literals + * + * @cctx: The zstd compression context. + * @dst: The buffer to compress the data into. + * @dst_capacity: The size of the destination buffer. + * @in_seqs: The array of zstd_sequence to compress. + * @in_seqs_size: The number of sequences in in_seqs. + * @literals: The literals associated to the sequences to be compressed. + * @lit_size: The size of the literals in the literals buffer. + * @lit_capacity: The size of the literals buffer. + * @decompressed_size: The size of the input data + * + * Return: The compressed size or an error, which can be checked using + * zstd_is_error(). 
+ */ +size_t zstd_compress_sequences_and_literals(zstd_cctx *cctx, void* dst, size_t dst_capacity, + const zstd_sequence *in_seqs, size_t in_seqs_size, + const void* literals, size_t lit_size, size_t lit_capacity, + size_t decompressed_size); + #endif /* LINUX_ZSTD_H */ diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h index 58b6dd45a969..c307fb011132 100644 --- a/include/linux/zstd_errors.h +++ b/include/linux/zstd_errors.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. + * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -12,13 +13,18 @@ #define ZSTD_ERRORS_H_398273423 -/*===== dependency =====*/ -#include /* size_t */ +/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +#define ZSTDERRORLIB_VISIBLE +#ifndef ZSTDERRORLIB_HIDDEN +# if (__GNUC__ >= 4) && !defined(__MINGW32__) +# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) +# else +# define ZSTDERRORLIB_HIDDEN +# endif +#endif -/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ -#define ZSTDERRORLIB_VISIBILITY -#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY +#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE /*-********************************************* * Error codes list @@ -43,14 +49,18 @@ typedef enum { ZSTD_error_frameParameter_windowTooLarge = 16, ZSTD_error_corruption_detected = 20, ZSTD_error_checksum_wrong = 22, + ZSTD_error_literals_headerWrong = 24, ZSTD_error_dictionary_corrupted = 30, ZSTD_error_dictionary_wrong = 32, ZSTD_error_dictionaryCreation_failed = 34, ZSTD_error_parameter_unsupported = 40, + ZSTD_error_parameter_combination_unsupported = 41, ZSTD_error_parameter_outOfBound = 42, ZSTD_error_tableLog_tooLarge = 44, ZSTD_error_maxSymbolValue_tooLarge = 46, ZSTD_error_maxSymbolValue_tooSmall = 48, + ZSTD_error_cannotProduce_uncompressedBlock = 49, + ZSTD_error_stabilityCondition_notRespected = 50, ZSTD_error_stage_wrong = 60, ZSTD_error_init_missing = 62, ZSTD_error_memory_allocation = 64, @@ -58,18 +68,18 @@ typedef enum { ZSTD_error_dstSize_tooSmall = 70, ZSTD_error_srcSize_wrong = 72, ZSTD_error_dstBuffer_null = 74, + ZSTD_error_noForwardProgress_destFull = 80, + ZSTD_error_noForwardProgress_inputEmpty = 82, /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ ZSTD_error_frameIndex_tooLarge = 100, ZSTD_error_seekableIO = 102, ZSTD_error_dstBuffer_wrong = 104, ZSTD_error_srcBuffer_wrong = 105, + ZSTD_error_sequenceProducer_failed = 106, + ZSTD_error_externalSequences_invalid = 107, ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ } ZSTD_ErrorCode; -/*! ZSTD_getErrorCode() : - convert a `size_t` function result into a `ZSTD_ErrorCode` enum type, - which can be used to compare with enum list published above */ -ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /*< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */ diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h index 79d55465d5c1..e295d4125dde 100644 --- a/include/linux/zstd_lib.h +++ b/include/linux/zstd_lib.h @@ -1,5 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* - * Copyright (c) Yann Collet, Facebook, Inc. 
+ * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -11,23 +12,47 @@ #ifndef ZSTD_H_235446 #define ZSTD_H_235446 -/* ====== Dependency ======*/ -#include /* INT_MAX */ + +/* ====== Dependencies ======*/ #include /* size_t */ +#include /* list of errors */ +#if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) +#include /* INT_MAX */ +#endif /* ZSTD_STATIC_LINKING_ONLY */ + /* ===== ZSTDLIB_API : control library symbols visibility ===== */ -#ifndef ZSTDLIB_VISIBLE +#define ZSTDLIB_VISIBLE + +#ifndef ZSTDLIB_HIDDEN # if (__GNUC__ >= 4) && !defined(__MINGW32__) -# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) # else -# define ZSTDLIB_VISIBLE # define ZSTDLIB_HIDDEN # endif #endif + #define ZSTDLIB_API ZSTDLIB_VISIBLE +/* Deprecation warnings : + * Should these warnings be a problem, it is generally possible to disable them, + * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. + * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. + */ +#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ +#else +# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) || defined(__IAR_SYSTEMS_ICC__) +# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) +# elif (__GNUC__ >= 3) +# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) +# else +# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +# define ZSTD_DEPRECATED(message) +# endif +#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ + /* ***************************************************************************** Introduction @@ -65,7 +90,7 @@ /*------ Version ------*/ #define ZSTD_VERSION_MAJOR 1 #define ZSTD_VERSION_MINOR 5 -#define ZSTD_VERSION_RELEASE 2 +#define ZSTD_VERSION_RELEASE 7 #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) /*! ZSTD_versionNumber() : @@ -103,11 +128,12 @@ ZSTDLIB_API const char* ZSTD_versionString(void); /* ************************************* -* Simple API +* Simple Core API ***************************************/ /*! ZSTD_compress() : * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. - * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. + * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have + * enough space to successfully compress the data. * @return : compressed size written into `dst` (<= `dstCapacity), * or an error code if it fails (which can be tested using ZSTD_isError()). */ ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, @@ -115,47 +141,55 @@ ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, int compressionLevel); /*! ZSTD_decompress() : - * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. - * `dstCapacity` is an upper bound of originalSize to regenerate. - * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. - * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), - * or an errorCode if it fails (which can be tested using ZSTD_isError()). 
*/ + * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. + * Multiple compressed frames can be decompressed at once with this method. + * The result will be the concatenation of all decompressed frames, back to back. + * `dstCapacity` is an upper bound of originalSize to regenerate. + * First frame's decompressed size can be extracted using ZSTD_getFrameContentSize(). + * If maximum upper bound isn't known, prefer using streaming mode to decompress data. + * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), + * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity, const void* src, size_t compressedSize); + +/*====== Decompression helper functions ======*/ + /*! ZSTD_getFrameContentSize() : requires v1.3.0+ - * `src` should point to the start of a ZSTD encoded frame. - * `srcSize` must be at least as large as the frame header. - * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. - * @return : - decompressed size of `src` frame content, if known - * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined - * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) - * note 1 : a 0 return value means the frame is valid but "empty". - * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode. - * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. - * In which case, it's necessary to use streaming mode to decompress data. - * Optionally, application can rely on some implicit limit, - * as ZSTD_decompress() only needs an upper bound of decompressed size. - * (For example, data could be necessarily cut into blocks <= 16 KB). - * note 3 : decompressed size is always present when compression is completed using single-pass functions, - * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). - * note 4 : decompressed size can be very large (64-bits value), - * potentially larger than what local system can handle as a single memory segment. - * In which case, it's necessary to use streaming mode to decompress data. - * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. - * Always ensure return value fits within application's authorized limits. - * Each application can set its own limits. - * note 6 : This function replaces ZSTD_getDecompressedSize() */ + * `src` should point to the start of a ZSTD encoded frame. + * `srcSize` must be at least as large as the frame header. + * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. + * @return : - decompressed size of `src` frame content, if known + * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) + * note 1 : a 0 return value means the frame is valid but "empty". + * When invoking this method on a skippable frame, it will return 0. + * note 2 : decompressed size is an optional field, it may not be present (typically in streaming mode). + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. + * Optionally, application can rely on some implicit limit, + * as ZSTD_decompress() only needs an upper bound of decompressed size. 
+ * (For example, data could be necessarily cut into blocks <= 16 KB). + * note 3 : decompressed size is always present when compression is completed using single-pass functions, + * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). + * note 4 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure return value fits within application's authorized limits. + * Each application can set its own limits. + * note 6 : This function replaces ZSTD_getDecompressedSize() */ #define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) #define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); -/*! ZSTD_getDecompressedSize() : - * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize(). +/*! ZSTD_getDecompressedSize() (obsolete): + * This function is now obsolete, in favor of ZSTD_getFrameContentSize(). * Both functions work the same way, but ZSTD_getDecompressedSize() blends * "empty", "unknown" and "error" results to the same return value (0), * while ZSTD_getFrameContentSize() gives them separate return values. * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ @@ -163,18 +197,50 @@ ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t * `srcSize` must be >= first frame size * @return : the compressed size of the first frame starting at `src`, * suitable to pass as `srcSize` to `ZSTD_decompress` or similar, - * or an error code if input is invalid */ + * or an error code if input is invalid + * Note 1: this method is called _find*() because it's not enough to read the header, + * it may have to scan through the frame's content, to reach its end. + * Note 2: this method also works with Skippable Frames. In which case, + * it returns the size of the complete skippable frame, + * which is always equal to its content size + 8 bytes for headers. */ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize); -/*====== Helper functions ======*/ -#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ -ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ -ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ -ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ -ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ -ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ -ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ +/*====== Compression helper functions ======*/ + +/*! 
ZSTD_compressBound() : + * maximum compressed size in worst case single-pass scenario. + * When invoking `ZSTD_compress()`, or any other one-pass compression function, + * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) + * as it eliminates one potential failure scenario, + * aka not enough room in dst buffer to write the compressed frame. + * Note : ZSTD_compressBound() itself can fail, if @srcSize >= ZSTD_MAX_INPUT_SIZE . + * In which case, ZSTD_compressBound() will return an error code + * which can be tested using ZSTD_isError(). + * + * ZSTD_COMPRESSBOUND() : + * same as ZSTD_compressBound(), but as a macro. + * It can be used to produce constants, which can be useful for static allocation, + * for example to size a static array on stack. + * Will produce constant value 0 if srcSize is too large. + */ +#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U) +#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ + + +/*====== Error helper functions ======*/ +/* ZSTD_isError() : + * Most ZSTD_* functions returning a size_t value can be tested for error, + * using ZSTD_isError(). + * @return 1 if error, 0 otherwise + */ +ZSTDLIB_API unsigned ZSTD_isError(size_t result); /*!< tells if a `size_t` function result is an error code */ +ZSTDLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); /* convert a result into an error code, which can be compared to error enum list */ +ZSTDLIB_API const char* ZSTD_getErrorName(size_t result); /*!< provides readable string from a function result */ +ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ +ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ /* ************************************* @@ -182,25 +248,25 @@ ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compres ***************************************/ /*= Compression context * When compressing many times, - * it is recommended to allocate a context just once, - * and re-use it for each successive compression operation. - * This will make workload friendlier for system's memory. + * it is recommended to allocate a compression context just once, + * and reuse it for each successive compression operation. + * This will make the workload easier for system's memory. * Note : re-using context is just a speed / resource optimization. * It doesn't change the compression ratio, which remains identical. - * Note 2 : In multi-threaded environments, - * use one different context per thread for parallel execution. + * Note 2: For parallel execution in multi-threaded environments, + * use one different context per thread . */ typedef struct ZSTD_CCtx_s ZSTD_CCtx; ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void); -ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* accept NULL pointer */ +ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* compatible with NULL pointer */ /*! ZSTD_compressCCtx() : * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. 
- * Important : in order to behave similarly to `ZSTD_compress()`, - * this function compresses at requested compression level, - * __ignoring any other parameter__ . + * Important : in order to mirror `ZSTD_compress()` behavior, + * this function compresses at the requested compression level, + * __ignoring any other advanced parameter__ . * If any advanced parameter was set using the advanced API, - * they will all be reset. Only `compressionLevel` remains. + * they will all be reset. Only @compressionLevel remains. */ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, @@ -210,7 +276,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, /*= Decompression context * When decompressing many times, * it is recommended to allocate a context only once, - * and re-use it for each successive compression operation. + * and reuse it for each successive compression operation. * This will make workload friendlier for system's memory. * Use one context per thread for parallel execution. */ typedef struct ZSTD_DCtx_s ZSTD_DCtx; @@ -220,7 +286,7 @@ ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); /* accept NULL pointer * /*! ZSTD_decompressDCtx() : * Same as ZSTD_decompress(), * requires an allocated ZSTD_DCtx. - * Compatible with sticky parameters. + * Compatible with sticky parameters (see below). */ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, @@ -236,12 +302,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, * using ZSTD_CCtx_set*() functions. * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! - * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . + * __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ . * * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). * * This API supersedes all other "advanced" API entry points in the experimental section. - * In the future, we expect to remove from experimental API entry points which are redundant with this API. + * In the future, we expect to remove API entry points from experimental which are redundant with this API. */ @@ -324,6 +390,19 @@ typedef enum { * The higher the value of selected strategy, the more complex it is, * resulting in stronger and slower compression. * Special: value 0 means "use default strategy". */ + + ZSTD_c_targetCBlockSize=130, /* v1.5.6+ + * Attempts to fit compressed block size into approximately targetCBlockSize. + * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX. + * Note that it's not a guarantee, just a convergence target (default:0). + * No target when targetCBlockSize == 0. + * This is helpful in low bandwidth streaming environments to improve end-to-end latency, + * when a client can make use of partial documents (a prominent example being Chrome). + * Note: this parameter is stable since v1.5.6. + * It was present as an experimental parameter in earlier versions, + * but it's not recommended using it with earlier library versions + * due to massive performance regressions. + */ /* LDM mode parameters */ ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. 
* This parameter is designed to improve compression ratio @@ -403,15 +482,18 @@ typedef enum { * ZSTD_c_forceMaxWindow * ZSTD_c_forceAttachDict * ZSTD_c_literalCompressionMode - * ZSTD_c_targetCBlockSize * ZSTD_c_srcSizeHint * ZSTD_c_enableDedicatedDictSearch * ZSTD_c_stableInBuffer * ZSTD_c_stableOutBuffer * ZSTD_c_blockDelimiters * ZSTD_c_validateSequences - * ZSTD_c_useBlockSplitter + * ZSTD_c_blockSplitterLevel + * ZSTD_c_splitAfterSequences * ZSTD_c_useRowMatchFinder + * ZSTD_c_prefetchCDictTables + * ZSTD_c_enableSeqProducerFallback + * ZSTD_c_maxBlockSize * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? names directly; * also, the enums values themselves are unstable and can still change. @@ -421,7 +503,7 @@ typedef enum { ZSTD_c_experimentalParam3=1000, ZSTD_c_experimentalParam4=1001, ZSTD_c_experimentalParam5=1002, - ZSTD_c_experimentalParam6=1003, + /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */ ZSTD_c_experimentalParam7=1004, ZSTD_c_experimentalParam8=1005, ZSTD_c_experimentalParam9=1006, @@ -430,7 +512,12 @@ typedef enum { ZSTD_c_experimentalParam12=1009, ZSTD_c_experimentalParam13=1010, ZSTD_c_experimentalParam14=1011, - ZSTD_c_experimentalParam15=1012 + ZSTD_c_experimentalParam15=1012, + ZSTD_c_experimentalParam16=1013, + ZSTD_c_experimentalParam17=1014, + ZSTD_c_experimentalParam18=1015, + ZSTD_c_experimentalParam19=1016, + ZSTD_c_experimentalParam20=1017 } ZSTD_cParameter; typedef struct { @@ -493,7 +580,7 @@ typedef enum { * They will be used to compress next frame. * Resetting session never fails. * - The parameters : changes all parameters back to "default". - * This removes any reference to any dictionary too. + * This also removes any reference to any dictionary or external sequence producer. * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) * - Both : similar to resetting the session, followed by resetting parameters. @@ -502,11 +589,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); /*! ZSTD_compress2() : * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. + * (note that this entry point doesn't even expose a compression level parameter). * ZSTD_compress2() always starts a new frame. * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() * - The function is always blocking, returns when compression is completed. - * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. + * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have + * enough space to successfully compress the data, though it is possible it fails for other reasons. * @return : compressed size written into `dst` (<= `dstCapacity), * or an error code if it fails (which can be tested using ZSTD_isError()). */ @@ -543,13 +632,17 @@ typedef enum { * ZSTD_d_stableOutBuffer * ZSTD_d_forceIgnoreChecksum * ZSTD_d_refMultipleDDicts + * ZSTD_d_disableHuffmanAssembly + * ZSTD_d_maxBlockSize * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? 
names directly */ ZSTD_d_experimentalParam1=1000, ZSTD_d_experimentalParam2=1001, ZSTD_d_experimentalParam3=1002, - ZSTD_d_experimentalParam4=1003 + ZSTD_d_experimentalParam4=1003, + ZSTD_d_experimentalParam5=1004, + ZSTD_d_experimentalParam6=1005 } ZSTD_dParameter; @@ -604,14 +697,14 @@ typedef struct ZSTD_outBuffer_s { * A ZSTD_CStream object is required to track streaming operation. * Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. * ZSTD_CStream objects can be reused multiple times on consecutive compression operations. -* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. +* It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. * * For parallel execution, use one separate ZSTD_CStream per thread. * * note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. * * Parameters are sticky : when starting a new compression on the same context, -* it will re-use the same sticky parameters as previous compression session. +* it will reuse the same sticky parameters as previous compression session. * When in doubt, it's recommended to fully initialize the context before usage. * Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), * ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to @@ -700,6 +793,11 @@ typedef enum { * only ZSTD_e_end or ZSTD_e_flush operations are allowed. * Before starting a new compression job, or changing compression parameters, * it is required to fully flush internal buffers. + * - note: if an operation ends with an error, it may leave @cctx in an undefined state. + * Therefore, it's UB to invoke ZSTD_compressStream2() of ZSTD_compressStream() on such a state. + * In order to be re-employed after an error, a state must be reset, + * which can be done explicitly (ZSTD_CCtx_reset()), + * or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx()) */ ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, ZSTD_outBuffer* output, @@ -728,8 +826,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output * This following is a legacy streaming API, available since v1.0+ . * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). * It is redundant, but remains fully supported. - * Streaming in combination with advanced parameters and dictionary compression - * can only be used through the new API. ******************************************************************************/ /*! @@ -738,6 +834,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); + * + * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API + * to compress with a dictionary. */ ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); /*! @@ -758,7 +857,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); * * A ZSTD_DStream object is required to track streaming operations. * Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources. -* ZSTD_DStream objects can be re-used multiple times. +* ZSTD_DStream objects can be re-employed multiple times. 
* * Use ZSTD_initDStream() to start a new decompression operation. * @return : recommended first input size @@ -768,16 +867,21 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); * The function will update both `pos` fields. * If `input.pos < input.size`, some input has not been consumed. * It's up to the caller to present again remaining data. +* * The function tries to flush all data decoded immediately, respecting output buffer size. * If `output.pos < output.size`, decoder has flushed everything it could. -* But if `output.pos == output.size`, there might be some data left within internal buffers., +* +* However, when `output.pos == output.size`, it's more difficult to know. +* If @return > 0, the frame is not complete, meaning +* either there is still some data left to flush within internal buffers, +* or there is more input to read to complete the frame (or both). * In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer. * Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX. * @return : 0 when a frame is completely decoded and fully flushed, * or an error code, which can be tested using ZSTD_isError(), * or any other value > 0, which means there is still some decoding or flushing to do to complete current frame : * the return value is a suggested next input size (just a hint for better latency) -* that will never request more than the remaining frame size. +* that will never request more than the remaining content of the compressed frame. * *******************************************************************************/ typedef ZSTD_DCtx ZSTD_DStream; /*< DCtx and DStream are now effectively same object (>= v1.3.0) */ @@ -788,13 +892,38 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer /*===== Streaming decompression functions =====*/ -/* This function is redundant with the advanced API and equivalent to: +/*! ZSTD_initDStream() : + * Initialize/reset DStream state for new decompression operation. + * Call before new decompression operation using same DStream. * + * Note : This function is redundant with the advanced API and equivalent to: * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); * ZSTD_DCtx_refDDict(zds, NULL); */ ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); +/*! ZSTD_decompressStream() : + * Streaming decompression function. + * Call repetitively to consume full input updating it as necessary. + * Function will update both input and output `pos` fields exposing current state via these fields: + * - `input.pos < input.size`, some input remaining and caller should provide remaining input + * on the next call. + * - `output.pos < output.size`, decoder flushed internal output buffer. + * - `output.pos == output.size`, unflushed data potentially present in the internal buffers, + * check ZSTD_decompressStream() @return value, + * if > 0, invoke it again to flush remaining data to output. + * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. + * + * @return : 0 when a frame is completely decoded and fully flushed, + * or an error code, which can be tested using ZSTD_isError(), + * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. + * + * Note: when an operation returns with an error code, the @zds state may be left in undefined state. + * It's UB to invoke `ZSTD_decompressStream()` on such a state. 
+ * In order to re-use such a state, it must be first reset, + * which can be done explicitly (`ZSTD_DCtx_reset()`), + * or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`) + */ ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ @@ -913,7 +1042,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); * If @return == 0, the dictID could not be decoded. * This could for one of the following reasons : * - The frame does not require a dictionary to be decoded (most common case). - * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. + * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information. * Note : this use case also happens when using a non-conformant dictionary. * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). * - This is not a Zstandard frame. @@ -925,9 +1054,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); * Advanced dictionary and prefix API (Requires v1.4.0+) * * This API allows dictionaries to be used with ZSTD_compress2(), - * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and - * only reset with the context is reset with ZSTD_reset_parameters or - * ZSTD_reset_session_and_parameters. Prefixes are single-use. + * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). + * Dictionaries are sticky, they remain valid when same context is reused, + * they only reset when the context is reset + * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. + * In contrast, Prefixes are single-use. ******************************************************************************/ @@ -937,8 +1068,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, * meaning "return to no-dictionary mode". - * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. - * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). + * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, + * until parameters are reset, a new dictionary is loaded, or the dictionary + * is explicitly invalidated by loading a NULL dictionary. * Note 2 : Loading a dictionary involves building tables. * It's also a CPU consuming operation, with non-negligible impact on latency. * Tables are dependent on compression parameters, and for this reason, @@ -947,11 +1079,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. * In such a case, dictionary buffer must outlive its users. * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() - * to precisely select how dictionary content must be interpreted. */ + * to precisely select how dictionary content must be interpreted. + * Note 5 : This method does not benefit from LDM (long distance mode). 
+ * If you want to employ LDM on some large dictionary content, + * prefer employing ZSTD_CCtx_refPrefix() described below. + */ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ - * Reference a prepared dictionary, to be used for all next compressed frames. + * Reference a prepared dictionary, to be used for all future compressed frames. * Note that compression parameters are enforced from within CDict, * and supersede any compression parameter previously set within CCtx. * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. @@ -970,6 +1106,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); * Decompression will need same prefix to properly regenerate data. * Compressing with a prefix is similar in outcome as performing a diff and compressing it, * but performs much faster, especially during decompression (compression speed is tunable with compression level). + * This method is compatible with LDM (long distance mode). * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary * Note 1 : Prefix buffer is referenced. It **must** outlive compression. @@ -986,9 +1123,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize); /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ - * Create an internal DDict from dict buffer, - * to be used to decompress next frames. - * The dictionary remains valid for all future frames, until explicitly invalidated. + * Create an internal DDict from dict buffer, to be used to decompress all future frames. + * The dictionary remains valid for all future frames, until explicitly invalidated, or + * a new dictionary is loaded. * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, * meaning "return to no-dictionary mode". @@ -1012,9 +1149,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s * The memory for the table is allocated on the first call to refDDict, and can be * freed with ZSTD_freeDCtx(). * + * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary + * will be managed, and referencing a dictionary effectively "discards" any previous one. + * * @result : 0, or an error code (which can be tested with ZSTD_isError()). - * Note 1 : Currently, only one dictionary can be managed. - * Referencing a new dictionary effectively "discards" any previous one. * Special: referencing a NULL DDict means "return to no-dictionary mode". * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. */ @@ -1051,6 +1189,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds); ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict); ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #endif /* ZSTD_H_235446 */ @@ -1066,29 +1205,12 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); #if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) #define ZSTD_H_ZSTD_STATIC_LINKING_ONLY + /* This can be overridden externally to hide static symbols. 
*/ #ifndef ZSTDLIB_STATIC_API #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE #endif -/* Deprecation warnings : - * Should these warnings be a problem, it is generally possible to disable them, - * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. - * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. - */ -#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS -# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ -#else -# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) -# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) -# elif (__GNUC__ >= 3) -# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) -# else -# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") -# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API -# endif -#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ - /* ************************************************************************************** * experimental API (static linking only) **************************************************************************************** @@ -1123,6 +1245,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ #define ZSTD_STRATEGY_MIN ZSTD_fast #define ZSTD_STRATEGY_MAX ZSTD_btultra2 +#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */ #define ZSTD_OVERLAPLOG_MIN 0 @@ -1146,7 +1269,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN) /* Advanced parameter bounds */ -#define ZSTD_TARGETCBLOCKSIZE_MIN 64 +#define ZSTD_TARGETCBLOCKSIZE_MIN 1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */ #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX #define ZSTD_SRCSIZEHINT_MIN 0 #define ZSTD_SRCSIZEHINT_MAX INT_MAX @@ -1188,7 +1311,7 @@ typedef struct { * * Note: This field is optional. ZSTD_generateSequences() will calculate the value of * 'rep', but repeat offsets do not necessarily need to be calculated from an external - * sequence provider's perspective. For example, ZSTD_compressSequences() does not + * sequence provider perspective. For example, ZSTD_compressSequences() does not * use this 'rep' field at all (as of now). */ } ZSTD_Sequence; @@ -1293,17 +1416,18 @@ typedef enum { } ZSTD_literalCompressionMode_e; typedef enum { - /* Note: This enum controls features which are conditionally beneficial. Zstd typically will make a final - * decision on whether or not to enable the feature (ZSTD_ps_auto), but setting the switch to ZSTD_ps_enable - * or ZSTD_ps_disable allow for a force enable/disable the feature. + /* Note: This enum controls features which are conditionally beneficial. + * Zstd can take a decision on whether or not to enable the feature (ZSTD_ps_auto), + * but setting the switch to ZSTD_ps_enable or ZSTD_ps_disable force enable/disable the feature. 
*/ ZSTD_ps_auto = 0, /* Let the library automatically determine whether the feature shall be enabled */ ZSTD_ps_enable = 1, /* Force-enable the feature */ ZSTD_ps_disable = 2 /* Do not use the feature */ -} ZSTD_paramSwitch_e; +} ZSTD_ParamSwitch_e; +#define ZSTD_paramSwitch_e ZSTD_ParamSwitch_e /* old name */ /* ************************************* -* Frame size functions +* Frame header and size functions ***************************************/ /*! ZSTD_findDecompressedSize() : @@ -1345,34 +1469,130 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_findDecompressedSize(const void* src, ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize); /*! ZSTD_frameHeaderSize() : - * srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX. + * srcSize must be large enough, aka >= ZSTD_FRAMEHEADERSIZE_PREFIX. * @return : size of the Frame Header, * or an error code (if srcSize is too small) */ ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); +typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_FrameType_e; +#define ZSTD_frameType_e ZSTD_FrameType_e /* old name */ +typedef struct { + unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ + unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ + unsigned blockSizeMax; + ZSTD_FrameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ + unsigned headerSize; + unsigned dictID; /* for ZSTD_skippableFrame, contains the skippable magic variant [0-15] */ + unsigned checksumFlag; + unsigned _reserved1; + unsigned _reserved2; +} ZSTD_FrameHeader; +#define ZSTD_frameHeader ZSTD_FrameHeader /* old name */ + +/*! ZSTD_getFrameHeader() : + * decode Frame Header into `zfhPtr`, or requires larger `srcSize`. + * @return : 0 => header is complete, `zfhPtr` is correctly filled, + * >0 => `srcSize` is too small, @return value is the wanted `srcSize` amount, `zfhPtr` is not filled, + * or an error code, which can be tested using ZSTD_isError() */ +ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize); +/*! ZSTD_getFrameHeader_advanced() : + * same as ZSTD_getFrameHeader(), + * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + +/*! ZSTD_decompressionMargin() : + * Zstd supports in-place decompression, where the input and output buffers overlap. + * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, + * and the input buffer must be at the end of the output buffer. + * + * _______________________ Output Buffer ________________________ + * | | + * | ____ Input Buffer ____| + * | | | + * v v v + * |---------------------------------------|-----------|----------| + * ^ ^ ^ + * |___________________ Output_Size ___________________|_ Margin _| + * + * NOTE: See also ZSTD_DECOMPRESSION_MARGIN(). + * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or + * ZSTD_decompressDCtx(). + * NOTE: This function supports multi-frame input. + * + * @param src The compressed frame(s) + * @param srcSize The size of the compressed frame(s) + * @returns The decompression margin or an error that can be checked with ZSTD_isError(). 
+ */ +ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize); + +/*! ZSTD_DECOMPRESS_MARGIN() : + * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from + * the compressed frame, compute it from the original size and the blockSizeLog. + * See ZSTD_decompressionMargin() for details. + * + * WARNING: This macro does not support multi-frame input, the input must be a single + * zstd frame. If you need that support use the function, or implement it yourself. + * + * @param originalSize The original uncompressed size of the data. + * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX). + * Unless you explicitly set the windowLog smaller than + * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX. + */ +#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \ + ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \ + 4 /* checksum */ + \ + ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \ + (blockSize) /* One block of margin */ \ + )) + typedef enum { - ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ - ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ -} ZSTD_sequenceFormat_e; + ZSTD_sf_noBlockDelimiters = 0, /* ZSTD_Sequence[] has no block delimiters, just sequences */ + ZSTD_sf_explicitBlockDelimiters = 1 /* ZSTD_Sequence[] contains explicit block delimiters */ +} ZSTD_SequenceFormat_e; +#define ZSTD_sequenceFormat_e ZSTD_SequenceFormat_e /* old name */ + +/*! ZSTD_sequenceBound() : + * `srcSize` : size of the input buffer + * @return : upper-bound for the number of sequences that can be generated + * from a buffer of srcSize bytes + * + * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). + */ +ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); /*! ZSTD_generateSequences() : - * Generate sequences using ZSTD_compress2, given a source buffer. + * WARNING: This function is meant for debugging and informational purposes ONLY! + * Its implementation is flawed, and it will be deleted in a future version. + * It is not guaranteed to succeed, as there are several cases where it will give + * up and fail. You should NOT use this function in production code. + * + * This function is deprecated, and will be removed in a future version. + * + * Generate sequences using ZSTD_compress2(), given a source buffer. + * + * @param zc The compression context to be used for ZSTD_compress2(). Set any + * compression parameters you need on this context. + * @param outSeqs The output sequences buffer of size @p outSeqsSize + * @param outSeqsCapacity The size of the output sequences buffer. + * ZSTD_sequenceBound(srcSize) is an upper bound on the number + * of sequences that can be generated. + * @param src The source buffer to generate sequences from of size @p srcSize. + * @param srcSize The size of the source buffer. * * Each block will end with a dummy sequence * with offset == 0, matchLength == 0, and litLength == length of last literals. * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) * simply acts as a block delimiter. * - * zc can be used to insert custom compression params. 
- * This function invokes ZSTD_compress2 - * - * The output of this function can be fed into ZSTD_compressSequences() with CCtx - * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters - * @return : number of sequences generated + * @returns The number of sequences generated, necessarily less than + * ZSTD_sequenceBound(srcSize), or an error code that can be checked + * with ZSTD_isError(). */ - -ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, - size_t outSeqsSize, const void* src, size_t srcSize); +ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()") +ZSTDLIB_STATIC_API size_t +ZSTD_generateSequences(ZSTD_CCtx* zc, + ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, + const void* src, size_t srcSize); /*! ZSTD_mergeBlockDelimiters() : * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals @@ -1388,8 +1608,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); /*! ZSTD_compressSequences() : - * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. - * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) + * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. + * @src contains the entire input (not just the literals). + * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals + * If a dictionary is included, then the cctx should reference the dict (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.). * The entire source is compressed into a single frame. * * The compression behavior changes based on cctx params. In particular: @@ -1398,11 +1620,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si * the block size derived from the cctx, and sequences may be split. This is the default setting. * * If ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, the array of ZSTD_Sequence is expected to contain - * block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided. + * valid block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided. + * + * When ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, it's possible to decide generating repcodes + * using the advanced parameter ZSTD_c_repcodeResolution. Repcodes will improve compression ratio, though the benefit + * can vary greatly depending on Sequences. On the other hand, repcode resolution is an expensive operation. + * By default, it's disabled at low (<10) compression levels, and enabled above the threshold (>=10). + * ZSTD_c_repcodeResolution makes it possible to directly manage this processing in either direction. * - * If ZSTD_c_validateSequences == 0, this function will blindly accept the sequences provided. Invalid sequences cause undefined - * behavior. If ZSTD_c_validateSequences == 1, then if sequence is invalid (see doc/zstd_compression_format.md for - * specifics regarding offset/matchlength requirements) then the function will bail out and return an error. + * If ZSTD_c_validateSequences == 0, this function blindly accepts the Sequences provided. Invalid Sequences cause undefined + * behavior. 
If ZSTD_c_validateSequences == 1, then the function will detect invalid Sequences (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) and then bail out and return an error. * * In addition to the two adjustable experimental params, there are other important cctx params. * - ZSTD_c_minMatch MUST be set as less than or equal to the smallest match generated by the match finder. It has a minimum value of ZSTD_MINMATCH_MIN. @@ -1410,14 +1638,42 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si * - ZSTD_c_windowLog affects offset validation: this function will return an error at higher debug levels if a provided offset * is larger than what the spec allows for a given window log and dictionary (if present). See: doc/zstd_compression_format.md * - * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. - * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, - * and cannot emit an RLE block that disagrees with the repcode history - * @return : final compressed size or a ZSTD error. - */ -ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, - const ZSTD_Sequence* inSeqs, size_t inSeqsSize, - const void* src, size_t srcSize); + * Note: Repcodes are, as of now, always re-calculated within this function, ZSTD_Sequence.rep is effectively unused. + * Dev Note: Once ability to ingest repcodes become available, the explicit block delims mode must respect those repcodes exactly, + * and cannot emit an RLE block that disagrees with the repcode history. + * @return : final compressed size, or a ZSTD error code. + */ +ZSTDLIB_STATIC_API size_t +ZSTD_compressSequences(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize); + + +/*! ZSTD_compressSequencesAndLiterals() : + * This is a variant of ZSTD_compressSequences() which, + * instead of receiving (src,srcSize) as input parameter, receives (literals,litSize), + * aka all the literals, already extracted and laid out into a single continuous buffer. + * This can be useful if the process generating the sequences also happens to generate the buffer of literals, + * thus skipping an extraction + caching stage. + * It's a speed optimization, useful when the right conditions are met, + * but it also features the following limitations: + * - Only supports explicit delimiter mode + * - Currently does not support Sequences validation (so input Sequences are trusted) + * - Not compatible with frame checksum, which must be disabled + * - If any block is incompressible, will fail and return an error + * - @litSize must be == sum of all @.litLength fields in @inSeqs. Any discrepancy will generate an error. + * - @litBufCapacity is the size of the underlying buffer into which literals are written, starting at address @literals. + * @litBufCapacity must be at least 8 bytes larger than @litSize. + * - @decompressedSize must be correct, and correspond to the sum of all Sequences. Any discrepancy will generate an error. + * @return : final compressed size, or a ZSTD error code. + */ +ZSTDLIB_STATIC_API size_t +ZSTD_compressSequencesAndLiterals(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t nbSequences, + const void* literals, size_t litSize, size_t litBufCapacity, + size_t decompressedSize); /*! 
ZSTD_writeSkippableFrame() : @@ -1425,8 +1681,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* ds * * Skippable frames begin with a 4-byte magic number. There are 16 possible choices of magic number, * ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15. - * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, so - * the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. + * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, + * so the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. * * Returns an error if destination buffer is not large enough, if the source size is not representable * with a 4-byte unsigned int, or if the parameter magicVariant is greater than 15 (and therefore invalid). @@ -1434,26 +1690,28 @@ ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* ds * @return : number of bytes written or a ZSTD error. */ ZSTDLIB_STATIC_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, - const void* src, size_t srcSize, unsigned magicVariant); + const void* src, size_t srcSize, + unsigned magicVariant); /*! ZSTD_readSkippableFrame() : - * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. + * Retrieves the content of a zstd skippable frame starting at @src, and writes it to @dst buffer. * - * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, - * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested - * in the magicVariant. + * The parameter @magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. + * This can be NULL if the caller is not interested in the magicVariant. * * Returns an error if destination buffer is not large enough, or if the frame is not skippable. * * @return : number of bytes written or a ZSTD error. */ -ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, - const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, + unsigned* magicVariant, + const void* src, size_t srcSize); /*! ZSTD_isSkippableFrame() : * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. */ -ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); +ZSTDLIB_STATIC_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); @@ -1464,48 +1722,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); /*! ZSTD_estimate*() : * These functions make it possible to estimate memory usage * of a future {D,C}Ctx, before its creation. + * This is useful in combination with ZSTD_initStatic(), + * which makes it possible to employ a static buffer for ZSTD_CCtx* state. * * ZSTD_estimateCCtxSize() will provide a memory budget large enough - * for any compression level up to selected one. - * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate - * does not include space for a window buffer. - * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. + * to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2() + * associated with any compression level up to max specified one. 
* The estimate will assume the input may be arbitrarily large, * which is the worst case. * + * Note that the size estimation is specific for one-shot compression, + * it is not valid for streaming (see ZSTD_estimateCStreamSize*()) + * nor other potential ways of using a ZSTD_CCtx* state. + * * When srcSize can be bound by a known and rather "small" value, - * this fact can be used to provide a tighter estimation - * because the CCtx compression context will need less memory. - * This tighter estimation can be provided by more advanced functions + * this knowledge can be used to provide a tighter budget estimation + * because the ZSTD_CCtx* state will need less memory for small inputs. + * This tighter estimation can be provided by employing more advanced functions * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. * - * Note 2 : only single-threaded compression is supported. + * Note : only single-threaded compression is supported. * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. */ -ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel); ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); /*! ZSTD_estimateCStreamSize() : - * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. - * It will also consider src size to be arbitrarily "large", which is worst case. + * ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression + * using any compression level up to the max specified one. + * It will also consider src size to be arbitrarily "large", which is a worst case scenario. * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. * Note : CStream size estimation is only correct for single-threaded compression. - * ZSTD_DStream memory budget depends on window Size. + * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. + * Size estimates assume that no external sequence producer is registered. + * + * ZSTD_DStream memory budget depends on frame's window Size. * This information can be passed manually, using ZSTD_estimateDStreamSize, * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); + * Any frame requesting a window size larger than max specified one will be rejected. 
* Note : if streaming is init with function ZSTD_init?Stream_usingDict(), * an internal ?Dict will be created, which additional size is not estimated here. - * In this case, get total size by adding ZSTD_estimate?DictSize */ -ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); + * In this case, get total size by adding ZSTD_estimate?DictSize + */ +ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel); ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); -ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize); ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); /*! ZSTD_estimate?DictSize() : @@ -1568,7 +1837,15 @@ typedef void (*ZSTD_freeFunction) (void* opaque, void* address); typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem; static __attribute__((__unused__)) + +#if defined(__clang__) && __clang_major__ >= 5 +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" +#endif ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /*< this constant defers to stdlib's functions */ +#if defined(__clang__) && __clang_major__ >= 5 +#pragma clang diagnostic pop +#endif ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); @@ -1649,22 +1926,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); * This function never fails (wide contract) */ ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); +/*! ZSTD_CCtx_setCParams() : + * Set all parameters provided within @p cparams into the working @p cctx. + * Note : if modifying parameters during compression (MT mode only), + * note that changes to the .windowLog parameter will be ignored. + * @return 0 on success, or an error code (can be checked with ZSTD_isError()). + * On failure, no parameters are updated. + */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); + +/*! ZSTD_CCtx_setFParams() : + * Set all parameters provided within @p fparams into the working @p cctx. + * @return 0 on success, or an error code (can be checked with ZSTD_isError()). + */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); + +/*! ZSTD_CCtx_setParams() : + * Set all parameters provided within @p params into the working @p cctx. + * @return 0 on success, or an error code (can be checked with ZSTD_isError()). + */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); + /*! ZSTD_compress_advanced() : * Note : this function is now DEPRECATED. * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. * This prototype will generate compilation warnings. 
*/ ZSTD_DEPRECATED("use ZSTD_compress2") +ZSTDLIB_STATIC_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - const void* dict,size_t dictSize, - ZSTD_parameters params); + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_parameters params); /*! ZSTD_compress_usingCDict_advanced() : * Note : this function is now DEPRECATED. * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") +ZSTDLIB_STATIC_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, @@ -1725,7 +2025,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo * See the comments on that enum for an explanation of the feature. */ #define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4 -/* Controlled with ZSTD_paramSwitch_e enum. +/* Controlled with ZSTD_ParamSwitch_e enum. * Default is ZSTD_ps_auto. * Set to ZSTD_ps_disable to never compress literals. * Set to ZSTD_ps_enable to always compress literals. (Note: uncompressed literals @@ -1737,11 +2037,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo */ #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 -/* Tries to fit compressed block size to be around targetCBlockSize. - * No target when targetCBlockSize == 0. - * There is no guarantee on compressed block size (default:0) */ -#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 - /* User's best guess of source size. * Hint is not valid when srcSizeHint == 0. * There is no guarantee that hint is close to actual source size, @@ -1808,13 +2103,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo * Experimental parameter. * Default is 0 == disabled. Set to 1 to enable. * - * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same - * between calls, except for the modifications that zstd makes to pos (the - * caller must not modify pos). This is checked by the compressor, and - * compression will fail if it ever changes. This means the only flush - * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end - * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) - * MUST not be modified during compression or you will get data corruption. + * Tells the compressor that input data presented with ZSTD_inBuffer + * will ALWAYS be the same between calls. + * Technically, the @src pointer must never be changed, + * and the @pos field can only be updated by zstd. + * However, it's possible to increase the @size field, + * allowing scenarios where more data can be appended after compressions starts. + * These conditions are checked by the compressor, + * and compression will fail if they are not respected. + * Also, data in the ZSTD_inBuffer within the range [src, src + pos) + * MUST not be modified during compression or it will result in data corruption. * * When this flag is enabled zstd won't allocate an input window buffer, * because the user guarantees it can reference the ZSTD_inBuffer until @@ -1822,18 +2120,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo * large enough to fit a block (see ZSTD_c_stableOutBuffer). 
This will also * avoid the memcpy() from the input buffer to the input window buffer. * - * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. - * That means this flag cannot be used with ZSTD_compressStream(). - * * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using * this flag is ALWAYS memory safe, and will never access out-of-bounds - * memory. However, compression WILL fail if you violate the preconditions. + * memory. However, compression WILL fail if conditions are not respected. * - * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST - * not be modified during compression or you will get data corruption. This - * is because zstd needs to reference data in the ZSTD_inBuffer to find + * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST + * not be modified during compression or it will result in data corruption. + * This is because zstd needs to reference data in the ZSTD_inBuffer to find * matches. Normally zstd maintains its own window buffer for this purpose, - * but passing this flag tells zstd to use the user provided buffer. + * but passing this flag tells zstd to rely on user provided buffer instead. */ #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 @@ -1871,22 +2166,46 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo /* ZSTD_c_validateSequences * Default is 0 == disabled. Set to 1 to enable sequence validation. * - * For use with sequence compression API: ZSTD_compressSequences(). - * Designates whether or not we validate sequences provided to ZSTD_compressSequences() + * For use with sequence compression API: ZSTD_compressSequences*(). + * Designates whether or not provided sequences are validated within ZSTD_compressSequences*() * during function execution. * - * Without validation, providing a sequence that does not conform to the zstd spec will cause - * undefined behavior, and may produce a corrupted block. + * When Sequence validation is disabled (default), Sequences are compressed as-is, + * so they must correct, otherwise it would result in a corruption error. * - * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for + * Sequence validation adds some protection, by ensuring that all values respect boundary conditions. + * If a Sequence is detected invalid (see doc/zstd_compression_format.md for * specifics regarding offset/matchlength requirements) then the function will bail out and * return an error. - * */ #define ZSTD_c_validateSequences ZSTD_c_experimentalParam12 -/* ZSTD_c_useBlockSplitter - * Controlled with ZSTD_paramSwitch_e enum. +/* ZSTD_c_blockSplitterLevel + * note: this parameter only influences the first splitter stage, + * which is active before producing the sequences. + * ZSTD_c_splitAfterSequences controls the next splitter stage, + * which is active after sequence production. + * Note that both can be combined. + * Allowed values are between 0 and ZSTD_BLOCKSPLITTER_LEVEL_MAX included. + * 0 means "auto", which will select a value depending on current ZSTD_c_strategy. + * 1 means no splitting. + * Then, values from 2 to 6 are sorted in increasing cpu load order. + * + * Note that currently the first block is never split, + * to ensure expansion guarantees in presence of incompressible data. 
+ */ +#define ZSTD_BLOCKSPLITTER_LEVEL_MAX 6 +#define ZSTD_c_blockSplitterLevel ZSTD_c_experimentalParam20 + +/* ZSTD_c_splitAfterSequences + * This is a stronger splitter algorithm, + * based on actual sequences previously produced by the selected parser. + * It's also slower, and as a consequence, mostly used for high compression levels. + * While the post-splitter does overlap with the pre-splitter, + * both can nonetheless be combined, + * notably with ZSTD_c_blockSplitterLevel at ZSTD_BLOCKSPLITTER_LEVEL_MAX, + * resulting in higher compression ratio than just one of them. + * * Default is ZSTD_ps_auto. * Set to ZSTD_ps_disable to never use block splitter. * Set to ZSTD_ps_enable to always use block splitter. @@ -1894,10 +2213,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use * block splitting based on the compression parameters. */ -#define ZSTD_c_useBlockSplitter ZSTD_c_experimentalParam13 +#define ZSTD_c_splitAfterSequences ZSTD_c_experimentalParam13 /* ZSTD_c_useRowMatchFinder - * Controlled with ZSTD_paramSwitch_e enum. + * Controlled with ZSTD_ParamSwitch_e enum. * Default is ZSTD_ps_auto. * Set to ZSTD_ps_disable to never use row-based matchfinder. * Set to ZSTD_ps_enable to force usage of row-based matchfinder. @@ -1928,6 +2247,80 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo */ #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 +/* ZSTD_c_prefetchCDictTables + * Controlled with ZSTD_ParamSwitch_e enum. Default is ZSTD_ps_auto. + * + * In some situations, zstd uses CDict tables in-place rather than copying them + * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). + * In such situations, compression speed is seriously impacted when CDict tables are + * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables + * when they are used in-place. + * + * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. + * For sufficiently large inputs, zstd will by default memcpy() CDict tables + * into the working context, so there is no need to prefetch. This parameter is + * targeted at a middle range of input sizes, where a prefetch is cheap enough to be + * useful but memcpy() is too expensive. The exact range of input sizes where this + * makes sense is best determined by careful experimentation. + * + * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, + * but in the future zstd may conditionally enable this feature via an auto-detection + * heuristic for cold CDicts. + * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. + */ +#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 + +/* ZSTD_c_enableSeqProducerFallback + * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. + * + * Controls whether zstd will fall back to an internal sequence producer if an + * external sequence producer is registered and returns an error code. This fallback + * is block-by-block: the internal sequence producer will only be called for blocks + * where the external sequence producer returns an error code. Fallback parsing will + * follow any other cParam settings, such as compression level, the same as in a + * normal (fully-internal) compression operation. 
+ * + * The user is strongly encouraged to read the full Block-Level Sequence Producer API + * documentation (below) before setting this parameter. */ +#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 + +/* ZSTD_c_maxBlockSize + * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). + * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. + * + * This parameter can be used to set an upper bound on the blocksize + * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper + * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make + * compressBound() inaccurate). Only currently meant to be used for testing. + */ +#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 + +/* ZSTD_c_repcodeResolution + * This parameter only has an effect if ZSTD_c_blockDelimiters is + * set to ZSTD_sf_explicitBlockDelimiters (may change in the future). + * + * This parameter affects how zstd parses external sequences, + * provided via the ZSTD_compressSequences*() API + * or from an external block-level sequence producer. + * + * If set to ZSTD_ps_enable, the library will check for repeated offsets within + * external sequences, even if those repcodes are not explicitly indicated in + * the "rep" field. Note that this is the only way to exploit repcode matches + * while using compressSequences*() or an external sequence producer, since zstd + * currently ignores the "rep" field of external sequences. + * + * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in + * external sequences, regardless of whether the "rep" field has been set. This + * reduces sequence compression overhead by about 25% while sacrificing some + * compression ratio. + * + * The default value is ZSTD_ps_auto, for which the library will enable/disable + * based on compression level (currently: level<10 disables, level>=10 enables). + */ +#define ZSTD_c_repcodeResolution ZSTD_c_experimentalParam19 +#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 /* older name */ + + /*! ZSTD_CCtx_getParameter() : * Get the requested compression parameter value, selected by enum ZSTD_cParameter, * and store it into int* value. @@ -2084,7 +2477,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete * in the range [dst, dst + pos) MUST not be modified during decompression * or you will get data corruption. * - * When this flags is enabled zstd won't allocate an output buffer, because + * When this flag is enabled zstd won't allocate an output buffer, because * it can write directly to the ZSTD_outBuffer, but it will still allocate * an input buffer large enough to fit any compressed block. This will also * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. @@ -2137,6 +2530,33 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete */ #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 +/* ZSTD_d_disableHuffmanAssembly + * Set to 1 to disable the Huffman assembly implementation. + * The default value is 0, which allows zstd to use the Huffman assembly + * implementation if available. + * + * This parameter can be used to disable Huffman assembly at runtime. + * If you want to disable it at compile time you can define the macro + * ZSTD_DISABLE_ASM. + */ +#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 + +/* ZSTD_d_maxBlockSize + * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). 
+ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. + * + * Forces the decompressor to reject blocks whose content size is + * larger than the configured maxBlockSize. When maxBlockSize is + * larger than the windowSize, the windowSize is used instead. + * This saves memory on the decoder when you know all blocks are small. + * + * This option is typically used in conjunction with ZSTD_c_maxBlockSize. + * + * WARNING: This causes the decoder to reject otherwise valid frames + * that have block sizes larger than the configured maxBlockSize. + */ +#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6 + /*! ZSTD_DCtx_setFormat() : * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). @@ -2145,6 +2565,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete * such ZSTD_f_zstd1_magicless for example. * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); /*! ZSTD_decompressStream_simpleArgs() : @@ -2181,6 +2602,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize); @@ -2198,17 +2620,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); /*! ZSTD_initCStream_advanced() : - * This function is DEPRECATED, and is approximately equivalent to: + * This function is DEPRECATED, and is equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); - * // Pseudocode: Set each zstd parameter and leave the rest as-is. - * for ((param, value) : params) { - * ZSTD_CCtx_setParameter(zcs, param, value); - * } + * ZSTD_CCtx_setParams(zcs, params); * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); * @@ -2218,6 +2638,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize, ZSTD_parameters params, @@ -2232,15 +2653,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); /*! ZSTD_initCStream_usingCDict_advanced() : - * This function is DEPRECATED, and is approximately equivalent to: + * This function is DEPRECATED, and is equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); - * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. 
- * for ((fParam, value) : fParams) { - * ZSTD_CCtx_setParameter(zcs, fParam, value); - * } + * ZSTD_CCtx_setFParams(zcs, fParams); * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); * ZSTD_CCtx_refCDict(zcs, cdict); * @@ -2250,6 +2669,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams, @@ -2264,7 +2684,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, * explicitly specified. * * start a new frame, using same parameters from previous frame. - * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. + * This is typically useful to skip dictionary loading stage, since it will reuse it in-place. * Note that zcs must be init at least once before using ZSTD_resetCStream(). * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. @@ -2274,6 +2694,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, * This prototype will generate compilation warnings. */ ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +ZSTDLIB_STATIC_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); @@ -2319,8 +2740,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); * * note: no dictionary will be used if dict == NULL or dictSize < 8 - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x */ +ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); /*! @@ -2330,8 +2751,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo * ZSTD_DCtx_refDDict(zds, ddict); * * note : ddict is referenced, it must outlive decompression session - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x */ +ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); /*! @@ -2339,18 +2760,202 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z * * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); * - * re-use decompression parameters from previous init; saves dictionary loading - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * reuse decompression parameters from previous init; saves dictionary loading */ +ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions") ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); +/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* + * + * *** OVERVIEW *** + * The Block-Level Sequence Producer API allows users to provide their own custom + * sequence producer which libzstd invokes to process each block. 
The produced list + * of sequences (literals and matches) is then post-processed by libzstd to produce + * valid compressed blocks. + * + * This block-level offload API is a more granular complement of the existing + * frame-level offload API compressSequences() (introduced in v1.5.1). It offers + * an easier migration story for applications already integrated with libzstd: the + * user application continues to invoke the same compression functions + * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits + * from the specific advantages of the external sequence producer. For example, + * the sequence producer could be tuned to take advantage of known characteristics + * of the input, to offer better speed / ratio, or could leverage hardware + * acceleration not available within libzstd itself. + * + * See contrib/externalSequenceProducer for an example program employing the + * Block-Level Sequence Producer API. + * + * *** USAGE *** + * The user is responsible for implementing a function of type + * ZSTD_sequenceProducer_F. For each block, zstd will pass the following + * arguments to the user-provided function: + * + * - sequenceProducerState: a pointer to a user-managed state for the sequence + * producer. + * + * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. + * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory + * backing outSeqs is managed by the CCtx. + * + * - src, srcSize: an input buffer for the sequence producer to parse. + * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. + * + * - dict, dictSize: a history buffer, which may be empty, which the sequence + * producer may reference as it parses the src buffer. Currently, zstd will + * always pass dictSize == 0 into external sequence producers, but this will + * change in the future. + * + * - compressionLevel: a signed integer representing the zstd compression level + * set by the user for the current operation. The sequence producer may choose + * to use this information to change its compression strategy and speed/ratio + * tradeoff. Note: the compression level does not reflect zstd parameters set + * through the advanced API. + * + * - windowSize: a size_t representing the maximum allowed offset for external + * sequences. Note that sequence offsets are sometimes allowed to exceed the + * windowSize if a dictionary is present, see doc/zstd_compression_format.md + * for details. + * + * The user-provided function shall return a size_t representing the number of + * sequences written to outSeqs. This return value will be treated as an error + * code if it is greater than outSeqsCapacity. The return value must be non-zero + * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided + * for convenience, but any value greater than outSeqsCapacity will be treated as + * an error code. + * + * If the user-provided function does not return an error code, the sequences + * written to outSeqs must be a valid parse of the src buffer. Data corruption may + * occur if the parse is not valid. A parse is defined to be valid if the + * following conditions hold: + * - The sum of matchLengths and literalLengths must equal srcSize. + * - All sequences in the parse, except for the final sequence, must have + * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have + * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. + * - All offsets must respect the windowSize parameter as specified in + * doc/zstd_compression_format.md. 
+ * - If the final sequence has matchLength == 0, it must also have offset == 0. + * + * zstd will only validate these conditions (and fail compression if they do not + * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence + * validation has a performance cost. + * + * If the user-provided function returns an error, zstd will either fall back + * to an internal sequence producer or fail the compression operation. The user can + * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback + * cParam. Fallback compression will follow any other cParam settings, such as + * compression level, the same as in a normal compression operation. + * + * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F + * function by calling + * ZSTD_registerSequenceProducer(cctx, + * sequenceProducerState, + * sequenceProducer) + * This setting will persist until the next parameter reset of the CCtx. + * + * The sequenceProducerState must be initialized by the user before calling + * ZSTD_registerSequenceProducer(). The user is responsible for destroying the + * sequenceProducerState. + * + * *** LIMITATIONS *** + * This API is compatible with all zstd compression APIs which respect advanced parameters. + * However, there are three limitations: + * + * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. + * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level + * external sequence producer. + * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some + * cases (see its documentation for details). Users must explicitly set + * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external + * sequence producer is registered. + * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default + * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should + * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence + * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). + * + * Second, history buffers are not currently supported. Concretely, zstd will always pass + * dictSize == 0 to the external sequence producer (for now). This has two implications: + * - Dictionaries are not currently supported. Compression will *not* fail if the user + * references a dictionary, but the dictionary won't have any effect. + * - Stream history is not currently supported. All advanced compression APIs, including + * streaming APIs, work with external sequence producers, but each block is treated as + * an independent chunk without history from previous blocks. + * + * Third, multi-threading within a single compression is not currently supported. In other words, + * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. + * Multi-threading across compressions is fine: simply create one CCtx per thread. + * + * Long-term, we plan to overcome all three limitations. There is no technical blocker to + * overcoming them. It is purely a question of engineering effort. + */ + +#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) + +typedef size_t (*ZSTD_sequenceProducer_F) ( + void* sequenceProducerState, + ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + int compressionLevel, + size_t windowSize +); + +/*! 
ZSTD_registerSequenceProducer() : + * Instruct zstd to use a block-level external sequence producer function. + * + * The sequenceProducerState must be initialized by the caller, and the caller is + * responsible for managing its lifetime. This parameter is sticky across + * compressions. It will remain set until the user explicitly resets compression + * parameters. + * + * Sequence producer registration is considered to be an "advanced parameter", + * part of the "advanced API". This means it will only have an effect on compression + * APIs which respect advanced parameters, such as compress2() and compressStream2(). + * Older compression APIs such as compressCCtx(), which predate the introduction of + * "advanced parameters", will ignore any external sequence producer setting. + * + * The sequence producer can be "cleared" by registering a NULL function pointer. This + * removes all limitations described above in the "LIMITATIONS" section of the API docs. + * + * The user is strongly encouraged to read the full API documentation (above) before + * calling this function. */ +ZSTDLIB_STATIC_API void +ZSTD_registerSequenceProducer( + ZSTD_CCtx* cctx, + void* sequenceProducerState, + ZSTD_sequenceProducer_F sequenceProducer +); + +/*! ZSTD_CCtxParams_registerSequenceProducer() : + * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params. + * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(), + * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx(). + * + * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx() + * is required, then this function is for you. Otherwise, you probably don't need it. + * + * See tests/zstreamtest.c for example usage. */ +ZSTDLIB_STATIC_API void +ZSTD_CCtxParams_registerSequenceProducer( + ZSTD_CCtx_params* params, + void* sequenceProducerState, + ZSTD_sequenceProducer_F sequenceProducer +); + + /* ******************************************************************* -* Buffer-less and synchronous inner streaming functions +* Buffer-less and synchronous inner streaming functions (DEPRECATED) +* +* This API is deprecated, and will be removed in a future version. +* It allows streaming (de)compression with user allocated buffers. +* However, it is hard to use, and not as well tested as the rest of +* our API. * -* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. -* But it's also a complex one, with several restrictions, documented below. -* Prefer normal streaming API for an easier experience. +* Please use the normal streaming API instead: ZSTD_compressStream2, +* and ZSTD_decompressStream. +* If there is functionality that you need, but it doesn't provide, +* please open an issue on our GitHub. ********************************************************************* */ /* @@ -2358,11 +2963,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); A ZSTD_CCtx object is required to track streaming operations. Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. - ZSTD_CCtx object can be re-used multiple times within successive compression operations. + ZSTD_CCtx object can be reused multiple times within successive compression operations. Start by initializing a context. Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. 
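  As an illustration only (not part of this patch), the begin/continue/end
  flow described in this section looks like:
    ZSTD_compressBegin(cctx, 3);   /* or any other compression level */
    ZSTD_compressContinue(cctx, dst, dstCapacity, chunk, chunkSize);
    ZSTD_compressEnd(cctx, dst, dstCapacity, last, lastSize);
  ZSTD_compressContinue() and ZSTD_compressEnd() return the number of bytes
  written into dst (or an error code), which the caller must consume before
  the next call.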
- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() Then, consume your input using ZSTD_compressContinue(). There are some important considerations to keep in mind when using this advanced function : @@ -2380,39 +2984,49 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. - `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. + `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again. */ /*===== Buffer-less streaming compression functions =====*/ +ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); +ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); +ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ -ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") +ZSTDLIB_STATIC_API +size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ ZSTD_DEPRECATED("use advanced API to access custom parameters") +ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ ZSTD_DEPRECATED("use advanced API to access custom parameters") +ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ /* Buffer-less streaming decompression (synchronous mode) A ZSTD_DCtx object is required to track streaming operations. Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. - A ZSTD_DCtx object can be re-used multiple times. 
+ A ZSTD_DCtx object can be reused multiple times. First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. Data fragment must be large enough to ensure successful decoding. `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. - @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. - >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. + result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. + >0 : `srcSize` is too small, please provide at least result bytes on next attempt. errorCode, which can be tested using ZSTD_isError(). - It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, + It fills a ZSTD_FrameHeader structure with important information to correctly decode the frame, such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`). Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information. As a consequence, check that values remain within valid application range. @@ -2428,7 +3042,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ The most memory efficient way is to use a round buffer of sufficient size. Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), - which can @return an error code if required value is too large for current system (in 32-bits mode). + which can return an error code if required value is too large for current system (in 32-bits mode). In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, up to the moment there is not enough room left in the buffer to guarantee decoding another full block, which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. @@ -2448,7 +3062,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. - @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. It can also be an error code, which can be tested with ZSTD_isError(). @@ -2471,27 +3085,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ */ /*===== Buffer-less streaming decompression functions =====*/ -typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; -typedef struct { - unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ - unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ - unsigned blockSizeMax; - ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ - unsigned headerSize; - unsigned dictID; - unsigned checksumFlag; -} ZSTD_frameHeader; -/*! ZSTD_getFrameHeader() : - * decode Frame Header, or requires larger `srcSize`. 
- * @return : 0, `zfhPtr` is correctly filled, - * >0, `srcSize` is too small, value is wanted `srcSize` amount, - * or an error code, which can be tested using ZSTD_isError() */ -ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ -/*! ZSTD_getFrameHeader_advanced() : - * same as ZSTD_getFrameHeader(), - * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ -ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); @@ -2502,6 +3096,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); /* misc */ +ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); @@ -2509,11 +3104,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); -/* ============================ */ -/* Block level API */ -/* ============================ */ +/* ========================================= */ +/* Block level API (DEPRECATED) */ +/* ========================================= */ /*! + + This API is deprecated in favor of the regular compression API. + You can get the frame header down to 2 bytes by setting: + - ZSTD_c_format = ZSTD_f_zstd1_magicless + - ZSTD_c_contentSizeFlag = 0 + - ZSTD_c_checksumFlag = 0 + - ZSTD_c_dictIDFlag = 0 + + This API is not as well tested as our normal API, so we recommend not using it. + We will be removing it in a future version. If the normal API doesn't provide + the functionality you need, please open a GitHub issue. + Block functions produce and decode raw zstd blocks, without frame metadata. Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. @@ -2524,7 +3131,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); - It is necessary to init context before starting + compression : any ZSTD_compressBegin*() variant, including with dictionary + decompression : any ZSTD_decompressBegin*() variant, including with dictionary - + copyCCtx() and copyDCtx() can be used too - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + If input is larger than a block size, it's necessary to split input data into multiple blocks + For inputs larger than a single block, consider using regular ZSTD_compress() instead. @@ -2541,11 +3147,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); */ /*===== Raw zstd block functions =====*/ +ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. 
See docs.") ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); +ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ - -- cgit v1.2.3 From 7dc0dddbe503fce55ffe31ae5e8a861256a31e3f Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Fri, 10 Jan 2025 10:19:16 +0100 Subject: mfd: stm32-timers: Add support for stm32mp25 Add support for STM32MP25 SoC. Use newly introduced compatible, to handle new features. Identification and hardware configuration registers allow to read the timer version and capabilities (counter width, number of channels...). So, rework the probe to avoid touching ARR register by simply read the counter width when available. This may avoid messing with a possibly running timer. Also add useful bit fields to stm32-timers header file. Signed-off-by: Fabrice Gasnier Link: https://lore.kernel.org/r/20250110091922.980627-3-fabrice.gasnier@foss.st.com Signed-off-by: Lee Jones --- include/linux/mfd/stm32-timers.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/stm32-timers.h b/include/linux/mfd/stm32-timers.h index f09ba598c97a..23b0cae4a9f8 100644 --- a/include/linux/mfd/stm32-timers.h +++ b/include/linux/mfd/stm32-timers.h @@ -33,6 +33,9 @@ #define TIM_DCR 0x48 /* DMA control register */ #define TIM_DMAR 0x4C /* DMA register for transfer */ #define TIM_TISEL 0x68 /* Input Selection */ +#define TIM_HWCFGR2 0x3EC /* hardware configuration 2 Reg (MP25) */ +#define TIM_HWCFGR1 0x3F0 /* hardware configuration 1 Reg (MP25) */ +#define TIM_IPIDR 0x3F8 /* IP identification Reg (MP25) */ #define TIM_CR1_CEN BIT(0) /* Counter Enable */ #define TIM_CR1_DIR BIT(4) /* Counter Direction */ @@ -100,6 +103,9 @@ #define TIM_BDTR_BKF(x) (0xf << (16 + (x) * 4)) #define TIM_DCR_DBA GENMASK(4, 0) /* DMA base addr */ #define TIM_DCR_DBL GENMASK(12, 8) /* DMA burst len */ +#define TIM_HWCFGR1_NB_OF_CC GENMASK(3, 0) /* Capture/compare channels */ +#define TIM_HWCFGR1_NB_OF_DT GENMASK(7, 4) /* Complementary outputs & dead-time generators */ +#define TIM_HWCFGR2_CNT_WIDTH GENMASK(15, 8) /* Counter width */ #define MAX_TIM_PSC 0xFFFF #define MAX_TIM_ICPSC 0x3 @@ -113,6 +119,8 @@ #define TIM_BDTR_BKF_MASK 0xF #define TIM_BDTR_BKF_SHIFT(x) (16 + (x) * 4) +#define STM32MP25_TIM_IPIDR 0x00120002 + enum stm32_timers_dmas { STM32_TIMERS_DMA_CH1, STM32_TIMERS_DMA_CH2, @@ -151,6 +159,7 @@ struct stm32_timers_dma { struct stm32_timers { struct clk *clk; + u32 ipidr; struct regmap *regmap; u32 max_arr; struct stm32_timers_dma dma; /* Only to be used by the parent */ -- cgit v1.2.3 From 7f9ed27eead6e65425b272448dd36791d6c823ff Mon Sep 17 00:00:00 2001 From: Shree Ramamoorthy Date: Thu, 6 Feb 2025 11:37:24 -0600 Subject: mfd: tps65219: Add support for TI TPS65215 PMIC Use chip ID and chip_data struct to differentiate 
between devices in probe(). Add TPS65215 resource information. Update descriptions and copyright information to reflect the driver supports 2 PMIC devices. Signed-off-by: Shree Ramamoorthy Link: https://lore.kernel.org/r/20250206173725.386720-5-s-ramamoorthy@ti.com Signed-off-by: Lee Jones --- include/linux/mfd/tps65219.h | 71 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/tps65219.h b/include/linux/mfd/tps65219.h index 546bceec7173..6047f92b367f 100644 --- a/include/linux/mfd/tps65219.h +++ b/include/linux/mfd/tps65219.h @@ -1,8 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Functions to access TPS65219 Power Management IC. + * Functions to access TPS65215/TPS65219 Power Management Integrated Chips * * Copyright (C) 2022 BayLibre Incorporated - https://www.baylibre.com/ + * Copyright (C) 2024 Texas Instruments Incorporated - https://www.ti.com/ */ #ifndef MFD_TPS65219_H @@ -13,8 +14,11 @@ #include #include -/* TPS chip id list */ -#define TPS65219 0xF0 +/* Chip id list*/ +enum pmic_id { + TPS65215, + TPS65219, +}; /* I2C ID for TPS65219 part */ #define TPS65219_I2C_ID 0x24 @@ -26,6 +30,7 @@ #define TPS65219_REG_BUCKS_CONFIG 0x03 #define TPS65219_REG_LDO4_VOUT 0x04 #define TPS65219_REG_LDO3_VOUT 0x05 +#define TPS65215_REG_LDO2_VOUT 0x05 #define TPS65219_REG_LDO2_VOUT 0x06 #define TPS65219_REG_LDO1_VOUT 0x07 #define TPS65219_REG_BUCK3_VOUT 0x8 @@ -33,6 +38,7 @@ #define TPS65219_REG_BUCK1_VOUT 0xA #define TPS65219_REG_LDO4_SEQUENCE_SLOT 0xB #define TPS65219_REG_LDO3_SEQUENCE_SLOT 0xC +#define TPS65215_REG_LDO2_SEQUENCE_SLOT 0xC #define TPS65219_REG_LDO2_SEQUENCE_SLOT 0xD #define TPS65219_REG_LDO1_SEQUENCE_SLOT 0xE #define TPS65219_REG_BUCK3_SEQUENCE_SLOT 0xF @@ -67,9 +73,16 @@ #define TPS65219_REG_DISCHARGE_CONFIG 0x2A /* main irq registers */ #define TPS65219_REG_INT_SOURCE 0x2B -/* 'sub irq' registers */ + +/* TPS65219 'sub irq' registers */ #define TPS65219_REG_INT_LDO_3_4 0x2C #define TPS65219_REG_INT_LDO_1_2 0x2D + +/* TPS65215 specific 'sub irq' registers */ +#define TPS65215_REG_INT_LDO_2 0x2C +#define TPS65215_REG_INT_LDO_1 0x2D + +/* Common TPS65215 & TPS65219 'sub irq' registers */ #define TPS65219_REG_INT_BUCK_3 0x2E #define TPS65219_REG_INT_BUCK_1_2 0x2F #define TPS65219_REG_INT_SYSTEM 0x30 @@ -86,6 +99,9 @@ #define TPS65219_REG_INT_TO_RV_POS 6 #define TPS65219_REG_INT_PB_POS 7 +#define TPS65215_REG_INT_LDO_2_POS 0 +#define TPS65215_REG_INT_LDO_1_POS 1 + #define TPS65219_REG_USER_NVM_CMD 0x34 #define TPS65219_REG_POWER_UP_STATUS 0x35 #define TPS65219_REG_SPARE_2 0x36 @@ -107,6 +123,7 @@ #define TPS65219_ENABLE_LDO1_EN_MASK BIT(3) #define TPS65219_ENABLE_LDO2_EN_MASK BIT(4) #define TPS65219_ENABLE_LDO3_EN_MASK BIT(5) +#define TPS65215_ENABLE_LDO2_EN_MASK BIT(5) #define TPS65219_ENABLE_LDO4_EN_MASK BIT(6) /* power ON-OFF sequence slot */ #define TPS65219_BUCKS_LDOS_SEQUENCE_OFF_SLOT_MASK GENMASK(3, 0) @@ -172,6 +189,13 @@ #define TPS65219_INT_LDO2_SCG_MASK BIT(3) #define TPS65219_INT_LDO2_OC_MASK BIT(4) #define TPS65219_INT_LDO2_UV_MASK BIT(5) +/* TPS65215 LDO1-2*/ +#define TPS65215_INT_LDO1_SCG_MASK BIT(0) +#define TPS65215_INT_LDO1_OC_MASK BIT(1) +#define TPS65215_INT_LDO1_UV_MASK BIT(2) +#define TPS65215_INT_LDO2_SCG_MASK BIT(0) +#define TPS65215_INT_LDO2_OC_MASK BIT(1) +#define TPS65215_INT_LDO2_UV_MASK BIT(2) /* BUCK3 */ #define TPS65219_INT_BUCK3_SCG_MASK BIT(0) #define TPS65219_INT_BUCK3_OC_MASK BIT(1) @@ -202,6 +226,7 @@ #define TPS65219_INT_LDO1_RV_MASK BIT(3) 
#define TPS65219_INT_LDO2_RV_MASK BIT(4) #define TPS65219_INT_LDO3_RV_MASK BIT(5) +#define TPS65215_INT_LDO2_RV_MASK BIT(5) #define TPS65219_INT_LDO4_RV_MASK BIT(6) /* Residual Voltage ShutDown */ #define TPS65219_INT_BUCK1_RV_SD_MASK BIT(0) @@ -210,6 +235,7 @@ #define TPS65219_INT_LDO1_RV_SD_MASK BIT(3) #define TPS65219_INT_LDO2_RV_SD_MASK BIT(4) #define TPS65219_INT_LDO3_RV_SD_MASK BIT(5) +#define TPS65215_INT_LDO2_RV_SD_MASK BIT(5) #define TPS65219_INT_LDO4_RV_SD_MASK BIT(6) #define TPS65219_INT_TIMEOUT_MASK BIT(7) /* Power Button */ @@ -235,6 +261,14 @@ enum { TPS65219_INT_LDO4_SCG, TPS65219_INT_LDO4_OC, TPS65219_INT_LDO4_UV, + /* TPS65215 LDO1*/ + TPS65215_INT_LDO1_SCG, + TPS65215_INT_LDO1_OC, + TPS65215_INT_LDO1_UV, + /* TPS65215 LDO2*/ + TPS65215_INT_LDO2_SCG, + TPS65215_INT_LDO2_OC, + TPS65215_INT_LDO2_UV, /* LDO1-2 */ TPS65219_INT_LDO1_SCG, TPS65219_INT_LDO1_OC, @@ -271,6 +305,7 @@ enum { TPS65219_INT_BUCK3_RV, TPS65219_INT_LDO1_RV, TPS65219_INT_LDO2_RV, + TPS65215_INT_LDO2_RV, TPS65219_INT_LDO3_RV, TPS65219_INT_LDO4_RV, /* Residual Voltage ShutDown */ @@ -278,6 +313,7 @@ enum { TPS65219_INT_BUCK2_RV_SD, TPS65219_INT_BUCK3_RV_SD, TPS65219_INT_LDO1_RV_SD, + TPS65215_INT_LDO2_RV_SD, TPS65219_INT_LDO2_RV_SD, TPS65219_INT_LDO3_RV_SD, TPS65219_INT_LDO4_RV_SD, @@ -287,6 +323,12 @@ enum { TPS65219_INT_PB_RISING_EDGE_DETECT, }; +enum tps65215_regulator_id { + /* DCDC's same as TPS65219 */ + /* LDO1 is the same as TPS65219 */ + TPS65215_LDO_2 = 4, +}; + enum tps65219_regulator_id { /* DCDC's */ TPS65219_BUCK_1, @@ -300,11 +342,26 @@ enum tps65219_regulator_id { }; /* Number of step-down converters available */ -#define TPS65219_NUM_DCDC 3 +#define TPS6521X_NUM_BUCKS 3 /* Number of LDO voltage regulators available */ #define TPS65219_NUM_LDO 4 +#define TPS65215_NUM_LDO 2 /* Number of total regulators available */ -#define TPS65219_NUM_REGULATOR (TPS65219_NUM_DCDC + TPS65219_NUM_LDO) +#define TPS65219_NUM_REGULATOR (TPS6521X_NUM_BUCKS + TPS65219_NUM_LDO) +#define TPS65215_NUM_REGULATOR (TPS6521X_NUM_BUCKS + TPS65215_NUM_LDO) + +/* Define the TPS65215 IRQ numbers */ +enum tps65215_irqs { + /* INT source registers */ + TPS65215_TO_RV_SD_SET_IRQ, + TPS65215_RV_SET_IRQ, + TPS65215_SYS_SET_IRQ, + TPS65215_BUCK_1_2_SET_IRQ, + TPS65215_BUCK_3_SET_IRQ, + TPS65215_LDO_1_SET_IRQ, + TPS65215_LDO_2_SET_IRQ, + TPS65215_PB_SET_IRQ, +}; /* Define the TPS65219 IRQ numbers */ enum tps65219_irqs { @@ -326,6 +383,7 @@ enum tps65219_irqs { * * @dev: MFD device * @regmap: Regmap for accessing the device registers + * @chip_id: Chip ID * @irq_data: Regmap irq data used for the irq chip * @nb: notifier block for the restart handler */ @@ -333,6 +391,7 @@ struct tps65219 { struct device *dev; struct regmap *regmap; + unsigned int chip_id; struct regmap_irq_chip_data *irq_data; struct notifier_block nb; }; -- cgit v1.2.3 From 7947219ab1a2d4d6ee3e8103ef4345cb00ab565b Mon Sep 17 00:00:00 2001 From: Shree Ramamoorthy Date: Thu, 6 Feb 2025 11:37:25 -0600 Subject: mfd: tps65219: Add support for TI TPS65214 PMIC Use chip ID and chip_data struct to differentiate between 3 PMIC devices in probe(). Add TPS65214 resource information. Update descriptions and copyright information to reflect the driver supports 3 PMIC devices. 
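A hedged sketch of the chip ID / chip_data dispatch described above
(illustrative only; names such as tps6521x_of_match and struct chip_data
are hypothetical, not the driver's actual tables):

    static const struct of_device_id tps6521x_of_match[] = {
        { .compatible = "ti,tps65214", .data = &tps65214_chip_data },
        { .compatible = "ti,tps65215", .data = &tps65215_chip_data },
        { .compatible = "ti,tps65219", .data = &tps65219_chip_data },
        { }
    };

    /* In probe(), per-chip resources and IRQ layout come from match data: */
    const struct chip_data *cd = device_get_match_data(&client->dev);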
Signed-off-by: Shree Ramamoorthy Link: https://lore.kernel.org/r/20250206173725.386720-6-s-ramamoorthy@ti.com Signed-off-by: Lee Jones --- include/linux/mfd/tps65219.h | 65 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/tps65219.h b/include/linux/mfd/tps65219.h index 6047f92b367f..3e8d29189267 100644 --- a/include/linux/mfd/tps65219.h +++ b/include/linux/mfd/tps65219.h @@ -16,6 +16,7 @@ /* Chip id list*/ enum pmic_id { + TPS65214, TPS65215, TPS65219, }; @@ -28,17 +29,23 @@ enum pmic_id { #define TPS65219_REG_NVM_ID 0x01 #define TPS65219_REG_ENABLE_CTRL 0x02 #define TPS65219_REG_BUCKS_CONFIG 0x03 +#define TPS65214_REG_LOCK 0x03 #define TPS65219_REG_LDO4_VOUT 0x04 +#define TPS65214_REG_LDO1_VOUT_STBY 0x04 #define TPS65219_REG_LDO3_VOUT 0x05 #define TPS65215_REG_LDO2_VOUT 0x05 +#define TPS65214_REG_LDO1_VOUT 0x05 #define TPS65219_REG_LDO2_VOUT 0x06 +#define TPS65214_REG_LDO2_VOUT 0x06 #define TPS65219_REG_LDO1_VOUT 0x07 +#define TPS65214_REG_LDO2_VOUT_STBY 0x07 #define TPS65219_REG_BUCK3_VOUT 0x8 #define TPS65219_REG_BUCK2_VOUT 0x9 #define TPS65219_REG_BUCK1_VOUT 0xA #define TPS65219_REG_LDO4_SEQUENCE_SLOT 0xB #define TPS65219_REG_LDO3_SEQUENCE_SLOT 0xC #define TPS65215_REG_LDO2_SEQUENCE_SLOT 0xC +#define TPS65214_REG_LDO1_SEQUENCE_SLOT 0xC #define TPS65219_REG_LDO2_SEQUENCE_SLOT 0xD #define TPS65219_REG_LDO1_SEQUENCE_SLOT 0xE #define TPS65219_REG_BUCK3_SEQUENCE_SLOT 0xF @@ -47,15 +54,21 @@ enum pmic_id { #define TPS65219_REG_nRST_SEQUENCE_SLOT 0x12 #define TPS65219_REG_GPIO_SEQUENCE_SLOT 0x13 #define TPS65219_REG_GPO2_SEQUENCE_SLOT 0x14 +#define TPS65214_REG_GPIO_GPI_SEQUENCE_SLOT 0x14 #define TPS65219_REG_GPO1_SEQUENCE_SLOT 0x15 +#define TPS65214_REG_GPO_SEQUENCE_SLOT 0x15 #define TPS65219_REG_POWER_UP_SLOT_DURATION_1 0x16 #define TPS65219_REG_POWER_UP_SLOT_DURATION_2 0x17 +/* _SLOT_DURATION_3 doesn't apply to TPS65215*/ #define TPS65219_REG_POWER_UP_SLOT_DURATION_3 0x18 #define TPS65219_REG_POWER_UP_SLOT_DURATION_4 0x19 +#define TPS65214_REG_BUCK3_VOUT_STBY 0x19 #define TPS65219_REG_POWER_DOWN_SLOT_DURATION_1 0x1A #define TPS65219_REG_POWER_DOWN_SLOT_DURATION_2 0x1B #define TPS65219_REG_POWER_DOWN_SLOT_DURATION_3 0x1C +#define TPS65214_REG_BUCK2_VOUT_STBY 0x1C #define TPS65219_REG_POWER_DOWN_SLOT_DURATION_4 0x1D +#define TPS65214_REG_BUCK1_VOUT_STBY 0x1D #define TPS65219_REG_GENERAL_CONFIG 0x1E #define TPS65219_REG_MFP_1_CONFIG 0x1F #define TPS65219_REG_MFP_2_CONFIG 0x20 @@ -82,6 +95,9 @@ enum pmic_id { #define TPS65215_REG_INT_LDO_2 0x2C #define TPS65215_REG_INT_LDO_1 0x2D +/* TPS65214 specific 'sub irq' register */ +#define TPS65214_REG_INT_LDO_1_2 0x2D + /* Common TPS65215 & TPS65219 'sub irq' registers */ #define TPS65219_REG_INT_BUCK_3 0x2E #define TPS65219_REG_INT_BUCK_1_2 0x2F @@ -102,6 +118,14 @@ enum pmic_id { #define TPS65215_REG_INT_LDO_2_POS 0 #define TPS65215_REG_INT_LDO_1_POS 1 +#define TPS65214_REG_INT_LDO_1_2_POS 0 +#define TPS65214_REG_INT_BUCK_3_POS 1 +#define TPS65214_REG_INT_BUCK_1_2_POS 2 +#define TPS65214_REG_INT_SYS_POS 3 +#define TPS65214_REG_INT_RV_POS 4 +#define TPS65214_REG_INT_TO_RV_POS 5 +#define TPS65214_REG_INT_PB_POS 6 + #define TPS65219_REG_USER_NVM_CMD 0x34 #define TPS65219_REG_POWER_UP_STATUS 0x35 #define TPS65219_REG_SPARE_2 0x36 @@ -124,6 +148,7 @@ enum pmic_id { #define TPS65219_ENABLE_LDO2_EN_MASK BIT(4) #define TPS65219_ENABLE_LDO3_EN_MASK BIT(5) #define TPS65215_ENABLE_LDO2_EN_MASK BIT(5) +#define TPS65214_ENABLE_LDO1_EN_MASK BIT(5) #define 
TPS65219_ENABLE_LDO4_EN_MASK BIT(6) /* power ON-OFF sequence slot */ #define TPS65219_BUCKS_LDOS_SEQUENCE_OFF_SLOT_MASK GENMASK(3, 0) @@ -175,14 +200,14 @@ enum pmic_id { #define TPS65219_REG_MASK_EFFECT_MASK GENMASK(2, 1) #define TPS65219_REG_MASK_INT_FOR_PB_MASK BIT(7) /* UnderVoltage - Short to GND - OverCurrent*/ -/* LDO3-4 */ +/* LDO3-4: only for TPS65219*/ #define TPS65219_INT_LDO3_SCG_MASK BIT(0) #define TPS65219_INT_LDO3_OC_MASK BIT(1) #define TPS65219_INT_LDO3_UV_MASK BIT(2) #define TPS65219_INT_LDO4_SCG_MASK BIT(3) #define TPS65219_INT_LDO4_OC_MASK BIT(4) #define TPS65219_INT_LDO4_UV_MASK BIT(5) -/* LDO1-2 */ +/* LDO1-2: TPS65214 & TPS65219 */ #define TPS65219_INT_LDO1_SCG_MASK BIT(0) #define TPS65219_INT_LDO1_OC_MASK BIT(1) #define TPS65219_INT_LDO1_UV_MASK BIT(2) @@ -210,12 +235,13 @@ enum pmic_id { #define TPS65219_INT_BUCK2_OC_MASK BIT(5) #define TPS65219_INT_BUCK2_NEG_OC_MASK BIT(6) #define TPS65219_INT_BUCK2_UV_MASK BIT(7) -/* Thermal Sensor */ +/* Thermal Sensor: TPS65219/TPS65215 */ #define TPS65219_INT_SENSOR_3_WARM_MASK BIT(0) +#define TPS65219_INT_SENSOR_3_HOT_MASK BIT(4) +/* Thermal Sensor: TPS65219/TPS65215/TPS65214 */ #define TPS65219_INT_SENSOR_2_WARM_MASK BIT(1) #define TPS65219_INT_SENSOR_1_WARM_MASK BIT(2) #define TPS65219_INT_SENSOR_0_WARM_MASK BIT(3) -#define TPS65219_INT_SENSOR_3_HOT_MASK BIT(4) #define TPS65219_INT_SENSOR_2_HOT_MASK BIT(5) #define TPS65219_INT_SENSOR_1_HOT_MASK BIT(6) #define TPS65219_INT_SENSOR_0_HOT_MASK BIT(7) @@ -227,6 +253,7 @@ enum pmic_id { #define TPS65219_INT_LDO2_RV_MASK BIT(4) #define TPS65219_INT_LDO3_RV_MASK BIT(5) #define TPS65215_INT_LDO2_RV_MASK BIT(5) +#define TPS65214_INT_LDO2_RV_MASK BIT(5) #define TPS65219_INT_LDO4_RV_MASK BIT(6) /* Residual Voltage ShutDown */ #define TPS65219_INT_BUCK1_RV_SD_MASK BIT(0) @@ -236,6 +263,7 @@ enum pmic_id { #define TPS65219_INT_LDO2_RV_SD_MASK BIT(4) #define TPS65219_INT_LDO3_RV_SD_MASK BIT(5) #define TPS65215_INT_LDO2_RV_SD_MASK BIT(5) +#define TPS65214_INT_LDO1_RV_SD_MASK BIT(5) #define TPS65219_INT_LDO4_RV_SD_MASK BIT(6) #define TPS65219_INT_TIMEOUT_MASK BIT(7) /* Power Button */ @@ -269,7 +297,7 @@ enum { TPS65215_INT_LDO2_SCG, TPS65215_INT_LDO2_OC, TPS65215_INT_LDO2_UV, - /* LDO1-2 */ + /* LDO1-2: TPS65219/TPS65214 */ TPS65219_INT_LDO1_SCG, TPS65219_INT_LDO1_OC, TPS65219_INT_LDO1_UV, @@ -306,6 +334,7 @@ enum { TPS65219_INT_LDO1_RV, TPS65219_INT_LDO2_RV, TPS65215_INT_LDO2_RV, + TPS65214_INT_LDO2_RV, TPS65219_INT_LDO3_RV, TPS65219_INT_LDO4_RV, /* Residual Voltage ShutDown */ @@ -313,6 +342,7 @@ enum { TPS65219_INT_BUCK2_RV_SD, TPS65219_INT_BUCK3_RV_SD, TPS65219_INT_LDO1_RV_SD, + TPS65214_INT_LDO1_RV_SD, TPS65215_INT_LDO2_RV_SD, TPS65219_INT_LDO2_RV_SD, TPS65219_INT_LDO3_RV_SD, @@ -323,6 +353,17 @@ enum { TPS65219_INT_PB_RISING_EDGE_DETECT, }; +enum tps65214_regulator_id { + /* + * DCDC's same as TPS65219 + * LDO1 maps to TPS65219's LDO3 + * LDO2 is the same as TPS65219 + * + */ + TPS65214_LDO_1 = 3, + TPS65214_LDO_2 = 4, +}; + enum tps65215_regulator_id { /* DCDC's same as TPS65219 */ /* LDO1 is the same as TPS65219 */ @@ -346,9 +387,23 @@ enum tps65219_regulator_id { /* Number of LDO voltage regulators available */ #define TPS65219_NUM_LDO 4 #define TPS65215_NUM_LDO 2 +#define TPS65214_NUM_LDO 2 /* Number of total regulators available */ #define TPS65219_NUM_REGULATOR (TPS6521X_NUM_BUCKS + TPS65219_NUM_LDO) #define TPS65215_NUM_REGULATOR (TPS6521X_NUM_BUCKS + TPS65215_NUM_LDO) +#define TPS65214_NUM_REGULATOR (TPS6521X_NUM_BUCKS + TPS65214_NUM_LDO) + +/* Define the TPS65214 IRQ numbers 
*/
+enum tps65214_irqs {
+ /* INT source registers */
+ TPS65214_TO_RV_SD_SET_IRQ,
+ TPS65214_RV_SET_IRQ,
+ TPS65214_SYS_SET_IRQ,
+ TPS65214_BUCK_1_2_SET_IRQ,
+ TPS65214_BUCK_3_SET_IRQ,
+ TPS65214_LDO_1_2_SET_IRQ,
+ TPS65214_PB_SET_IRQ = 7,
+};
+
 /* Define the TPS65215 IRQ numbers */
 enum tps65215_irqs {
-- 
cgit v1.2.3


From 417206a22669a0adeeaeedab2c8364fd021e3533 Mon Sep 17 00:00:00 2001
From: Lee Jones
Date: Tue, 11 Feb 2025 14:28:05 +0000
Subject: mfd: max8997: Remove unused function max8997_irq_exit()

Utilise devm_*() managed resource helpers for freeing IRQs instead.

Reported-by: Dr. David Alan Gilbert
Signed-off-by: Lee Jones
---
 include/linux/mfd/max8997-private.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/max8997-private.h b/include/linux/mfd/max8997-private.h
index a10cd6945232..f70eea0f2264 100644
--- a/include/linux/mfd/max8997-private.h
+++ b/include/linux/mfd/max8997-private.h
@@ -397,7 +397,6 @@ enum max8997_types {
 };

 extern int max8997_irq_init(struct max8997_dev *max8997);
-extern void max8997_irq_exit(struct max8997_dev *max8997);
 extern int max8997_irq_resume(struct max8997_dev *max8997);

 extern int max8997_read_reg(struct i2c_client *i2c, u8 reg, u8 *dest);
-- 
cgit v1.2.3


From 860e98066f5ef2b87a5df7b575a85a4d57adb5e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?=
Date: Wed, 12 Feb 2025 18:04:03 +0100
Subject: mfd: lp3943: Drop #include <linux/pwm.h> from header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The header doesn't make use of any symbols declared in <linux/pwm.h>.
There are three files that #include <linux/mfd/lp3943.h>. Two of them
(i.e. drivers/gpio/gpio-lp3943.c and drivers/mfd/lp3943.c) also don't use
any <linux/pwm.h> symbols and the third (drivers/pwm/pwm-lp3943.c) has an
explicit include of <linux/pwm.h> itself.

So drop the unused include.

The intended side effect is that drivers/gpio/gpio-lp3943.c and
drivers/mfd/lp3943.c stop importing the "PWM" module namespace.

Signed-off-by: Uwe Kleine-König
Link: https://lore.kernel.org/r/20250212170403.36619-2-u.kleine-koenig@baylibre.com
Signed-off-by: Lee Jones
---
 include/linux/mfd/lp3943.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/lp3943.h b/include/linux/mfd/lp3943.h
index 020a339f96e8..402f01078fcc 100644
--- a/include/linux/mfd/lp3943.h
+++ b/include/linux/mfd/lp3943.h
@@ -11,7 +11,6 @@
 #define __MFD_LP3943_H__

 #include <linux/gpio.h>
-#include <linux/pwm.h>
 #include <linux/regmap.h>

 /* Registers */
-- 
cgit v1.2.3


From 98cf2d50391097b642783528ef08845c7f0d9e04 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn
Date: Mon, 3 Mar 2025 11:00:55 +0100
Subject: mfd: Remove STA2x11 core driver

With commit dcbb01fbb7ae ("x86/pci: Remove old STA2x11 support"), the
core driver for STA2x11 is not needed and cannot be built anymore. Remove
the driver and its header file.

Signed-off-by: Lukas Bulwahn
Acked-by: Arnd Bergmann
Link: https://lore.kernel.org/r/20250303100055.372689-1-lukas.bulwahn@redhat.com
Signed-off-by: Lee Jones
---
 include/linux/mfd/sta2x11-mfd.h | 506 ----------------------------------------
 1 file changed, 506 deletions(-)
 delete mode 100644 include/linux/mfd/sta2x11-mfd.h

(limited to 'include/linux')

diff --git a/include/linux/mfd/sta2x11-mfd.h b/include/linux/mfd/sta2x11-mfd.h
deleted file mode 100644
index 2001ca5c44a9..000000000000
--- a/include/linux/mfd/sta2x11-mfd.h
+++ /dev/null
@@ -1,506 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (c) 2009-2011 Wind River Systems, Inc.
- * Copyright (c) 2011 ST Microelectronics (Alessandro Rubini) - * - * The STMicroelectronics ConneXt (STA2X11) chip has several unrelated - * functions in one PCI endpoint functions. This driver simply - * registers the platform devices in this iomemregion and exports a few - * functions to access common registers - */ - -#ifndef __STA2X11_MFD_H -#define __STA2X11_MFD_H -#include -#include - -enum sta2x11_mfd_plat_dev { - sta2x11_sctl = 0, - sta2x11_gpio, - sta2x11_scr, - sta2x11_time, - sta2x11_apbreg, - sta2x11_apb_soc_regs, - sta2x11_vic, - sta2x11_n_mfd_plat_devs, -}; - -#define STA2X11_MFD_SCTL_NAME "sta2x11-sctl" -#define STA2X11_MFD_GPIO_NAME "sta2x11-gpio" -#define STA2X11_MFD_SCR_NAME "sta2x11-scr" -#define STA2X11_MFD_TIME_NAME "sta2x11-time" -#define STA2X11_MFD_APBREG_NAME "sta2x11-apbreg" -#define STA2X11_MFD_APB_SOC_REGS_NAME "sta2x11-apb-soc-regs" -#define STA2X11_MFD_VIC_NAME "sta2x11-vic" - -extern u32 -__sta2x11_mfd_mask(struct pci_dev *, u32, u32, u32, enum sta2x11_mfd_plat_dev); - -/* - * The MFD PCI block includes the GPIO peripherals and other register blocks. - * For GPIO, we have 32*4 bits (I use "gsta" for "gpio sta2x11".) - */ -#define GSTA_GPIO_PER_BLOCK 32 -#define GSTA_NR_BLOCKS 4 -#define GSTA_NR_GPIO (GSTA_GPIO_PER_BLOCK * GSTA_NR_BLOCKS) - -/* Pinconfig is set by the board definition: altfunc, pull-up, pull-down */ -struct sta2x11_gpio_pdata { - unsigned pinconfig[GSTA_NR_GPIO]; -}; - -/* Macros below lifted from sh_pfc.h, with minor differences */ -#define PINMUX_TYPE_NONE 0 -#define PINMUX_TYPE_FUNCTION 1 -#define PINMUX_TYPE_OUTPUT_LOW 2 -#define PINMUX_TYPE_OUTPUT_HIGH 3 -#define PINMUX_TYPE_INPUT 4 -#define PINMUX_TYPE_INPUT_PULLUP 5 -#define PINMUX_TYPE_INPUT_PULLDOWN 6 - -/* Give names to GPIO pins, like PXA does, taken from the manual */ -#define STA2X11_GPIO0 0 -#define STA2X11_GPIO1 1 -#define STA2X11_GPIO2 2 -#define STA2X11_GPIO3 3 -#define STA2X11_GPIO4 4 -#define STA2X11_GPIO5 5 -#define STA2X11_GPIO6 6 -#define STA2X11_GPIO7 7 -#define STA2X11_GPIO8_RGBOUT_RED7 8 -#define STA2X11_GPIO9_RGBOUT_RED6 9 -#define STA2X11_GPIO10_RGBOUT_RED5 10 -#define STA2X11_GPIO11_RGBOUT_RED4 11 -#define STA2X11_GPIO12_RGBOUT_RED3 12 -#define STA2X11_GPIO13_RGBOUT_RED2 13 -#define STA2X11_GPIO14_RGBOUT_RED1 14 -#define STA2X11_GPIO15_RGBOUT_RED0 15 -#define STA2X11_GPIO16_RGBOUT_GREEN7 16 -#define STA2X11_GPIO17_RGBOUT_GREEN6 17 -#define STA2X11_GPIO18_RGBOUT_GREEN5 18 -#define STA2X11_GPIO19_RGBOUT_GREEN4 19 -#define STA2X11_GPIO20_RGBOUT_GREEN3 20 -#define STA2X11_GPIO21_RGBOUT_GREEN2 21 -#define STA2X11_GPIO22_RGBOUT_GREEN1 22 -#define STA2X11_GPIO23_RGBOUT_GREEN0 23 -#define STA2X11_GPIO24_RGBOUT_BLUE7 24 -#define STA2X11_GPIO25_RGBOUT_BLUE6 25 -#define STA2X11_GPIO26_RGBOUT_BLUE5 26 -#define STA2X11_GPIO27_RGBOUT_BLUE4 27 -#define STA2X11_GPIO28_RGBOUT_BLUE3 28 -#define STA2X11_GPIO29_RGBOUT_BLUE2 29 -#define STA2X11_GPIO30_RGBOUT_BLUE1 30 -#define STA2X11_GPIO31_RGBOUT_BLUE0 31 -#define STA2X11_GPIO32_RGBOUT_VSYNCH 32 -#define STA2X11_GPIO33_RGBOUT_HSYNCH 33 -#define STA2X11_GPIO34_RGBOUT_DEN 34 -#define STA2X11_GPIO35_ETH_CRS_DV 35 -#define STA2X11_GPIO36_ETH_TXD1 36 -#define STA2X11_GPIO37_ETH_TXD0 37 -#define STA2X11_GPIO38_ETH_TX_EN 38 -#define STA2X11_GPIO39_MDIO 39 -#define STA2X11_GPIO40_ETH_REF_CLK 40 -#define STA2X11_GPIO41_ETH_RXD1 41 -#define STA2X11_GPIO42_ETH_RXD0 42 -#define STA2X11_GPIO43_MDC 43 -#define STA2X11_GPIO44_CAN_TX 44 -#define STA2X11_GPIO45_CAN_RX 45 -#define STA2X11_GPIO46_MLB_DAT 46 -#define STA2X11_GPIO47_MLB_SIG 47 -#define 
STA2X11_GPIO48_SPI0_CLK 48 -#define STA2X11_GPIO49_SPI0_TXD 49 -#define STA2X11_GPIO50_SPI0_RXD 50 -#define STA2X11_GPIO51_SPI0_FRM 51 -#define STA2X11_GPIO52_SPI1_CLK 52 -#define STA2X11_GPIO53_SPI1_TXD 53 -#define STA2X11_GPIO54_SPI1_RXD 54 -#define STA2X11_GPIO55_SPI1_FRM 55 -#define STA2X11_GPIO56_SPI2_CLK 56 -#define STA2X11_GPIO57_SPI2_TXD 57 -#define STA2X11_GPIO58_SPI2_RXD 58 -#define STA2X11_GPIO59_SPI2_FRM 59 -#define STA2X11_GPIO60_I2C0_SCL 60 -#define STA2X11_GPIO61_I2C0_SDA 61 -#define STA2X11_GPIO62_I2C1_SCL 62 -#define STA2X11_GPIO63_I2C1_SDA 63 -#define STA2X11_GPIO64_I2C2_SCL 64 -#define STA2X11_GPIO65_I2C2_SDA 65 -#define STA2X11_GPIO66_I2C3_SCL 66 -#define STA2X11_GPIO67_I2C3_SDA 67 -#define STA2X11_GPIO68_MSP0_RCK 68 -#define STA2X11_GPIO69_MSP0_RXD 69 -#define STA2X11_GPIO70_MSP0_RFS 70 -#define STA2X11_GPIO71_MSP0_TCK 71 -#define STA2X11_GPIO72_MSP0_TXD 72 -#define STA2X11_GPIO73_MSP0_TFS 73 -#define STA2X11_GPIO74_MSP0_SCK 74 -#define STA2X11_GPIO75_MSP1_CK 75 -#define STA2X11_GPIO76_MSP1_RXD 76 -#define STA2X11_GPIO77_MSP1_FS 77 -#define STA2X11_GPIO78_MSP1_TXD 78 -#define STA2X11_GPIO79_MSP2_CK 79 -#define STA2X11_GPIO80_MSP2_RXD 80 -#define STA2X11_GPIO81_MSP2_FS 81 -#define STA2X11_GPIO82_MSP2_TXD 82 -#define STA2X11_GPIO83_MSP3_CK 83 -#define STA2X11_GPIO84_MSP3_RXD 84 -#define STA2X11_GPIO85_MSP3_FS 85 -#define STA2X11_GPIO86_MSP3_TXD 86 -#define STA2X11_GPIO87_MSP4_CK 87 -#define STA2X11_GPIO88_MSP4_RXD 88 -#define STA2X11_GPIO89_MSP4_FS 89 -#define STA2X11_GPIO90_MSP4_TXD 90 -#define STA2X11_GPIO91_MSP5_CK 91 -#define STA2X11_GPIO92_MSP5_RXD 92 -#define STA2X11_GPIO93_MSP5_FS 93 -#define STA2X11_GPIO94_MSP5_TXD 94 -#define STA2X11_GPIO95_SDIO3_DAT3 95 -#define STA2X11_GPIO96_SDIO3_DAT2 96 -#define STA2X11_GPIO97_SDIO3_DAT1 97 -#define STA2X11_GPIO98_SDIO3_DAT0 98 -#define STA2X11_GPIO99_SDIO3_CLK 99 -#define STA2X11_GPIO100_SDIO3_CMD 100 -#define STA2X11_GPIO101 101 -#define STA2X11_GPIO102 102 -#define STA2X11_GPIO103 103 -#define STA2X11_GPIO104 104 -#define STA2X11_GPIO105_SDIO2_DAT3 105 -#define STA2X11_GPIO106_SDIO2_DAT2 106 -#define STA2X11_GPIO107_SDIO2_DAT1 107 -#define STA2X11_GPIO108_SDIO2_DAT0 108 -#define STA2X11_GPIO109_SDIO2_CLK 109 -#define STA2X11_GPIO110_SDIO2_CMD 110 -#define STA2X11_GPIO111 111 -#define STA2X11_GPIO112 112 -#define STA2X11_GPIO113 113 -#define STA2X11_GPIO114 114 -#define STA2X11_GPIO115_SDIO1_DAT3 115 -#define STA2X11_GPIO116_SDIO1_DAT2 116 -#define STA2X11_GPIO117_SDIO1_DAT1 117 -#define STA2X11_GPIO118_SDIO1_DAT0 118 -#define STA2X11_GPIO119_SDIO1_CLK 119 -#define STA2X11_GPIO120_SDIO1_CMD 120 -#define STA2X11_GPIO121 121 -#define STA2X11_GPIO122 122 -#define STA2X11_GPIO123 123 -#define STA2X11_GPIO124 124 -#define STA2X11_GPIO125_UART2_TXD 125 -#define STA2X11_GPIO126_UART2_RXD 126 -#define STA2X11_GPIO127_UART3_TXD 127 - -/* - * The APB bridge has its own registers, needed by our users as well. - * They are accessed with the following read/mask/write function. 
- */
-static inline u32
-sta2x11_apbreg_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val)
-{
-	return __sta2x11_mfd_mask(pdev, reg, mask, val, sta2x11_apbreg);
-}
-
-/* CAN and MLB */
-#define APBREG_BSR 0x00 /* Bridge Status Reg */
-#define APBREG_PAER 0x08 /* Peripherals Address Error Reg */
-#define APBREG_PWAC 0x20 /* Peripheral Write Access Control reg */
-#define APBREG_PRAC 0x40 /* Peripheral Read Access Control reg */
-#define APBREG_PCG 0x60 /* Peripheral Clock Gating Reg */
-#define APBREG_PUR 0x80 /* Peripheral Under Reset Reg */
-#define APBREG_EMU_PCG 0xA0 /* Emulator Peripheral Clock Gating Reg */
-
-#define APBREG_CAN (1 << 1)
-#define APBREG_MLB (1 << 3)
-
-/* SARAC */
-#define APBREG_BSR_SARAC 0x100 /* Bridge Status Reg */
-#define APBREG_PAER_SARAC 0x108 /* Peripherals Address Error Reg */
-#define APBREG_PWAC_SARAC 0x120 /* Peripheral Write Access Control reg */
-#define APBREG_PRAC_SARAC 0x140 /* Peripheral Read Access Control reg */
-#define APBREG_PCG_SARAC 0x160 /* Peripheral Clock Gating Reg */
-#define APBREG_PUR_SARAC 0x180 /* Peripheral Under Reset Reg */
-#define APBREG_EMU_PCG_SARAC 0x1A0 /* Emulator Peripheral Clock Gating Reg */
-
-#define APBREG_SARAC (1 << 2)
-
-/*
- * The system controller has its own registers. Some of these are accessed
- * by out users as well, using the following read/mask/write/function
- */
-static inline
-u32 sta2x11_sctl_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val)
-{
-	return __sta2x11_mfd_mask(pdev, reg, mask, val, sta2x11_sctl);
-}
-
-#define SCTL_SCCTL 0x00 /* System controller control register */
-#define SCTL_ARMCFG 0x04 /* ARM configuration register */
-#define SCTL_SCPLLCTL 0x08 /* PLL control status register */
-
-#define SCTL_SCPLLCTL_AUDIO_PLL_PD BIT(1)
-#define SCTL_SCPLLCTL_FRAC_CONTROL BIT(3)
-#define SCTL_SCPLLCTL_STRB_BYPASS BIT(6)
-#define SCTL_SCPLLCTL_STRB_INPUT BIT(8)
-
-#define SCTL_SCPLLFCTRL 0x0c /* PLL frequency control register */
-
-#define SCTL_SCPLLFCTRL_AUDIO_PLL_NDIV_MASK 0xff
-#define SCTL_SCPLLFCTRL_AUDIO_PLL_NDIV_SHIFT 10
-#define SCTL_SCPLLFCTRL_AUDIO_PLL_IDF_MASK 7
-#define SCTL_SCPLLFCTRL_AUDIO_PLL_IDF_SHIFT 21
-#define SCTL_SCPLLFCTRL_AUDIO_PLL_ODF_MASK 7
-#define SCTL_SCPLLFCTRL_AUDIO_PLL_ODF_SHIFT 18
-#define SCTL_SCPLLFCTRL_DITHER_DISABLE_MASK 0x03
-#define SCTL_SCPLLFCTRL_DITHER_DISABLE_SHIFT 4
-
-
-#define SCTL_SCRESFRACT 0x10 /* PLL fractional input register */
-
-#define SCTL_SCRESFRACT_MASK 0x0000ffff
-
-
-#define SCTL_SCRESCTRL1 0x14 /* Peripheral reset control 1 */
-#define SCTL_SCRESXTRL2 0x18 /* Peripheral reset control 2 */
-#define SCTL_SCPEREN0 0x1c /* Peripheral clock enable register 0 */
-#define SCTL_SCPEREN1 0x20 /* Peripheral clock enable register 1 */
-#define SCTL_SCPEREN2 0x24 /* Peripheral clock enable register 2 */
-#define SCTL_SCGRST 0x28 /* Peripheral global reset */
-#define SCTL_SCPCIECSBRST 0x2c /* PCIe PAB CSB reset status register */
-#define SCTL_SCPCIPMCR1 0x30 /* PCI power management control 1 */
-#define SCTL_SCPCIPMCR2 0x34 /* PCI power management control 2 */
-#define SCTL_SCPCIPMSR1 0x38 /* PCI power management status 1 */
-#define SCTL_SCPCIPMSR2 0x3c /* PCI power management status 2 */
-#define SCTL_SCPCIPMSR3 0x40 /* PCI power management status 3 */
-#define SCTL_SCINTREN 0x44 /* Interrupt enable */
-#define SCTL_SCRISR 0x48 /* RAW interrupt status */
-#define SCTL_SCCLKSTAT0 0x4c /* Peripheral clocks status 0 */
-#define SCTL_SCCLKSTAT1 0x50 /* Peripheral clocks status 1 */
-#define SCTL_SCCLKSTAT2 0x54 /* Peripheral clocks status 2 */
-#define SCTL_SCRSTSTA 0x58 /* Reset status register */
-
-#define SCTL_SCRESCTRL1_USB_PHY_POR (1 << 0)
-#define SCTL_SCRESCTRL1_USB_OTG (1 << 1)
-#define SCTL_SCRESCTRL1_USB_HRST (1 << 2)
-#define SCTL_SCRESCTRL1_USB_PHY_HOST (1 << 3)
-#define SCTL_SCRESCTRL1_SATAII (1 << 4)
-#define SCTL_SCRESCTRL1_VIP (1 << 5)
-#define SCTL_SCRESCTRL1_PER_MMC0 (1 << 6)
-#define SCTL_SCRESCTRL1_PER_MMC1 (1 << 7)
-#define SCTL_SCRESCTRL1_PER_GPIO0 (1 << 8)
-#define SCTL_SCRESCTRL1_PER_GPIO1 (1 << 9)
-#define SCTL_SCRESCTRL1_PER_GPIO2 (1 << 10)
-#define SCTL_SCRESCTRL1_PER_GPIO3 (1 << 11)
-#define SCTL_SCRESCTRL1_PER_MTU0 (1 << 12)
-#define SCTL_SCRESCTRL1_KER_SPI0 (1 << 13)
-#define SCTL_SCRESCTRL1_KER_SPI1 (1 << 14)
-#define SCTL_SCRESCTRL1_KER_SPI2 (1 << 15)
-#define SCTL_SCRESCTRL1_KER_MCI0 (1 << 16)
-#define SCTL_SCRESCTRL1_KER_MCI1 (1 << 17)
-#define SCTL_SCRESCTRL1_PRE_HSI2C0 (1 << 18)
-#define SCTL_SCRESCTRL1_PER_HSI2C1 (1 << 19)
-#define SCTL_SCRESCTRL1_PER_HSI2C2 (1 << 20)
-#define SCTL_SCRESCTRL1_PER_HSI2C3 (1 << 21)
-#define SCTL_SCRESCTRL1_PER_MSP0 (1 << 22)
-#define SCTL_SCRESCTRL1_PER_MSP1 (1 << 23)
-#define SCTL_SCRESCTRL1_PER_MSP2 (1 << 24)
-#define SCTL_SCRESCTRL1_PER_MSP3 (1 << 25)
-#define SCTL_SCRESCTRL1_PER_MSP4 (1 << 26)
-#define SCTL_SCRESCTRL1_PER_MSP5 (1 << 27)
-#define SCTL_SCRESCTRL1_PER_MMC (1 << 28)
-#define SCTL_SCRESCTRL1_KER_MSP0 (1 << 29)
-#define SCTL_SCRESCTRL1_KER_MSP1 (1 << 30)
-#define SCTL_SCRESCTRL1_KER_MSP2 (1 << 31)
-
-#define SCTL_SCPEREN0_UART0 (1 << 0)
-#define SCTL_SCPEREN0_UART1 (1 << 1)
-#define SCTL_SCPEREN0_UART2 (1 << 2)
-#define SCTL_SCPEREN0_UART3 (1 << 3)
-#define SCTL_SCPEREN0_MSP0 (1 << 4)
-#define SCTL_SCPEREN0_MSP1 (1 << 5)
-#define SCTL_SCPEREN0_MSP2 (1 << 6)
-#define SCTL_SCPEREN0_MSP3 (1 << 7)
-#define SCTL_SCPEREN0_MSP4 (1 << 8)
-#define SCTL_SCPEREN0_MSP5 (1 << 9)
-#define SCTL_SCPEREN0_SPI0 (1 << 10)
-#define SCTL_SCPEREN0_SPI1 (1 << 11)
-#define SCTL_SCPEREN0_SPI2 (1 << 12)
-#define SCTL_SCPEREN0_I2C0 (1 << 13)
-#define SCTL_SCPEREN0_I2C1 (1 << 14)
-#define SCTL_SCPEREN0_I2C2 (1 << 15)
-#define SCTL_SCPEREN0_I2C3 (1 << 16)
-#define SCTL_SCPEREN0_SVDO_LVDS (1 << 17)
-#define SCTL_SCPEREN0_USB_HOST (1 << 18)
-#define SCTL_SCPEREN0_USB_OTG (1 << 19)
-#define SCTL_SCPEREN0_MCI0 (1 << 20)
-#define SCTL_SCPEREN0_MCI1 (1 << 21)
-#define SCTL_SCPEREN0_MCI2 (1 << 22)
-#define SCTL_SCPEREN0_MCI3 (1 << 23)
-#define SCTL_SCPEREN0_SATA (1 << 24)
-#define SCTL_SCPEREN0_ETHERNET (1 << 25)
-#define SCTL_SCPEREN0_VIC (1 << 26)
-#define SCTL_SCPEREN0_DMA_AUDIO (1 << 27)
-#define SCTL_SCPEREN0_DMA_SOC (1 << 28)
-#define SCTL_SCPEREN0_RAM (1 << 29)
-#define SCTL_SCPEREN0_VIP (1 << 30)
-#define SCTL_SCPEREN0_ARM (1 << 31)
-
-#define SCTL_SCPEREN1_UART0 (1 << 0)
-#define SCTL_SCPEREN1_UART1 (1 << 1)
-#define SCTL_SCPEREN1_UART2 (1 << 2)
-#define SCTL_SCPEREN1_UART3 (1 << 3)
-#define SCTL_SCPEREN1_MSP0 (1 << 4)
-#define SCTL_SCPEREN1_MSP1 (1 << 5)
-#define SCTL_SCPEREN1_MSP2 (1 << 6)
-#define SCTL_SCPEREN1_MSP3 (1 << 7)
-#define SCTL_SCPEREN1_MSP4 (1 << 8)
-#define SCTL_SCPEREN1_MSP5 (1 << 9)
-#define SCTL_SCPEREN1_SPI0 (1 << 10)
-#define SCTL_SCPEREN1_SPI1 (1 << 11)
-#define SCTL_SCPEREN1_SPI2 (1 << 12)
-#define SCTL_SCPEREN1_I2C0 (1 << 13)
-#define SCTL_SCPEREN1_I2C1 (1 << 14)
-#define SCTL_SCPEREN1_I2C2 (1 << 15)
-#define SCTL_SCPEREN1_I2C3 (1 << 16)
-#define SCTL_SCPEREN1_USB_PHY (1 << 17)
-
-/*
- * APB-SOC registers
- */
-static inline
-u32 sta2x11_apb_soc_regs_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val)
-{
-	return __sta2x11_mfd_mask(pdev, reg, mask, val, sta2x11_apb_soc_regs);
-}
-
-#define PCIE_EP1_FUNC3_0_INTR_REG 0x000
-#define PCIE_EP1_FUNC7_4_INTR_REG 0x004
-#define PCIE_EP2_FUNC3_0_INTR_REG 0x008
-#define PCIE_EP2_FUNC7_4_INTR_REG 0x00c
-#define PCIE_EP3_FUNC3_0_INTR_REG 0x010
-#define PCIE_EP3_FUNC7_4_INTR_REG 0x014
-#define PCIE_EP4_FUNC3_0_INTR_REG 0x018
-#define PCIE_EP4_FUNC7_4_INTR_REG 0x01c
-#define PCIE_INTR_ENABLE0_REG 0x020
-#define PCIE_INTR_ENABLE1_REG 0x024
-#define PCIE_EP1_FUNC_TC_REG 0x028
-#define PCIE_EP2_FUNC_TC_REG 0x02c
-#define PCIE_EP3_FUNC_TC_REG 0x030
-#define PCIE_EP4_FUNC_TC_REG 0x034
-#define PCIE_EP1_FUNC_F_REG 0x038
-#define PCIE_EP2_FUNC_F_REG 0x03c
-#define PCIE_EP3_FUNC_F_REG 0x040
-#define PCIE_EP4_FUNC_F_REG 0x044
-#define PCIE_PAB_AMBA_SW_RST_REG 0x048
-#define PCIE_PM_STATUS_0_PORT_0_4 0x04c
-#define PCIE_PM_STATUS_7_0_EP1 0x050
-#define PCIE_PM_STATUS_7_0_EP2 0x054
-#define PCIE_PM_STATUS_7_0_EP3 0x058
-#define PCIE_PM_STATUS_7_0_EP4 0x05c
-#define PCIE_DEV_ID_0_EP1_REG 0x060
-#define PCIE_CC_REV_ID_0_EP1_REG 0x064
-#define PCIE_DEV_ID_1_EP1_REG 0x068
-#define PCIE_CC_REV_ID_1_EP1_REG 0x06c
-#define PCIE_DEV_ID_2_EP1_REG 0x070
-#define PCIE_CC_REV_ID_2_EP1_REG 0x074
-#define PCIE_DEV_ID_3_EP1_REG 0x078
-#define PCIE_CC_REV_ID_3_EP1_REG 0x07c
-#define PCIE_DEV_ID_4_EP1_REG 0x080
-#define PCIE_CC_REV_ID_4_EP1_REG 0x084
-#define PCIE_DEV_ID_5_EP1_REG 0x088
-#define PCIE_CC_REV_ID_5_EP1_REG 0x08c
-#define PCIE_DEV_ID_6_EP1_REG 0x090
-#define PCIE_CC_REV_ID_6_EP1_REG 0x094
-#define PCIE_DEV_ID_7_EP1_REG 0x098
-#define PCIE_CC_REV_ID_7_EP1_REG 0x09c
-#define PCIE_DEV_ID_0_EP2_REG 0x0a0
-#define PCIE_CC_REV_ID_0_EP2_REG 0x0a4
-#define PCIE_DEV_ID_1_EP2_REG 0x0a8
-#define PCIE_CC_REV_ID_1_EP2_REG 0x0ac
-#define PCIE_DEV_ID_2_EP2_REG 0x0b0
-#define PCIE_CC_REV_ID_2_EP2_REG 0x0b4
-#define PCIE_DEV_ID_3_EP2_REG 0x0b8
-#define PCIE_CC_REV_ID_3_EP2_REG 0x0bc
-#define PCIE_DEV_ID_4_EP2_REG 0x0c0
-#define PCIE_CC_REV_ID_4_EP2_REG 0x0c4
-#define PCIE_DEV_ID_5_EP2_REG 0x0c8
-#define PCIE_CC_REV_ID_5_EP2_REG 0x0cc
-#define PCIE_DEV_ID_6_EP2_REG 0x0d0
-#define PCIE_CC_REV_ID_6_EP2_REG 0x0d4
-#define PCIE_DEV_ID_7_EP2_REG 0x0d8
-#define PCIE_CC_REV_ID_7_EP2_REG 0x0dC
-#define PCIE_DEV_ID_0_EP3_REG 0x0e0
-#define PCIE_CC_REV_ID_0_EP3_REG 0x0e4
-#define PCIE_DEV_ID_1_EP3_REG 0x0e8
-#define PCIE_CC_REV_ID_1_EP3_REG 0x0ec
-#define PCIE_DEV_ID_2_EP3_REG 0x0f0
-#define PCIE_CC_REV_ID_2_EP3_REG 0x0f4
-#define PCIE_DEV_ID_3_EP3_REG 0x0f8
-#define PCIE_CC_REV_ID_3_EP3_REG 0x0fc
-#define PCIE_DEV_ID_4_EP3_REG 0x100
-#define PCIE_CC_REV_ID_4_EP3_REG 0x104
-#define PCIE_DEV_ID_5_EP3_REG 0x108
-#define PCIE_CC_REV_ID_5_EP3_REG 0x10c
-#define PCIE_DEV_ID_6_EP3_REG 0x110
-#define PCIE_CC_REV_ID_6_EP3_REG 0x114
-#define PCIE_DEV_ID_7_EP3_REG 0x118
-#define PCIE_CC_REV_ID_7_EP3_REG 0x11c
-#define PCIE_DEV_ID_0_EP4_REG 0x120
-#define PCIE_CC_REV_ID_0_EP4_REG 0x124
-#define PCIE_DEV_ID_1_EP4_REG 0x128
-#define PCIE_CC_REV_ID_1_EP4_REG 0x12c
-#define PCIE_DEV_ID_2_EP4_REG 0x130
-#define PCIE_CC_REV_ID_2_EP4_REG 0x134
-#define PCIE_DEV_ID_3_EP4_REG 0x138
-#define PCIE_CC_REV_ID_3_EP4_REG 0x13c
-#define PCIE_DEV_ID_4_EP4_REG 0x140
-#define PCIE_CC_REV_ID_4_EP4_REG 0x144
-#define PCIE_DEV_ID_5_EP4_REG 0x148
-#define PCIE_CC_REV_ID_5_EP4_REG 0x14c
-#define PCIE_DEV_ID_6_EP4_REG 0x150
-#define PCIE_CC_REV_ID_6_EP4_REG 0x154
-#define PCIE_DEV_ID_7_EP4_REG 0x158
-#define PCIE_CC_REV_ID_7_EP4_REG 0x15c
-#define PCIE_SUBSYS_VEN_ID_REG 0x160
-#define PCIE_COMMON_CLOCK_CONFIG_0_4_0 0x164
-#define PCIE_MIPHYP_SSC_EN_REG 0x168
-#define PCIE_MIPHYP_ADDR_REG 0x16c
-#define PCIE_L1_ASPM_READY_REG 0x170
-#define PCIE_EXT_CFG_RDY_REG 0x174
-#define PCIE_SoC_INT_ROUTER_STATUS0_REG 0x178
-#define PCIE_SoC_INT_ROUTER_STATUS1_REG 0x17c
-#define PCIE_SoC_INT_ROUTER_STATUS2_REG 0x180
-#define PCIE_SoC_INT_ROUTER_STATUS3_REG 0x184
-#define DMA_IP_CTRL_REG 0x324
-#define DISP_BRIDGE_PU_PD_CTRL_REG 0x328
-#define VIP_PU_PD_CTRL_REG 0x32c
-#define USB_MLB_PU_PD_CTRL_REG 0x330
-#define SDIO_PU_PD_MISCFUNC_CTRL_REG1 0x334
-#define SDIO_PU_PD_MISCFUNC_CTRL_REG2 0x338
-#define UART_PU_PD_CTRL_REG 0x33c
-#define ARM_Lock 0x340
-#define SYS_IO_CHAR_REG1 0x344
-#define SYS_IO_CHAR_REG2 0x348
-#define SATA_CORE_ID_REG 0x34c
-#define SATA_CTRL_REG 0x350
-#define I2C_HSFIX_MISC_REG 0x354
-#define SPARE2_RESERVED 0x358
-#define SPARE3_RESERVED 0x35c
-#define MASTER_LOCK_REG 0x368
-#define SYSTEM_CONFIG_STATUS_REG 0x36c
-#define MSP_CLK_CTRL_REG 0x39c
-#define COMPENSATION_REG1 0x3c4
-#define COMPENSATION_REG2 0x3c8
-#define COMPENSATION_REG3 0x3cc
-#define TEST_CTL_REG 0x3d0
-
-/*
- * SECR (OTP) registers
- */
-#define STA2X11_SECR_CR 0x00
-#define STA2X11_SECR_FVR0 0x10
-#define STA2X11_SECR_FVR1 0x14
-
-extern int sta2x11_mfd_get_regs_data(struct platform_device *pdev,
-	enum sta2x11_mfd_plat_dev index,
-	void __iomem **regs,
-	spinlock_t **lock);
-
-#endif /* __STA2X11_MFD_H */
--
cgit v1.2.3

From c105c555f8b4fd57b09439806b43b97ebc240ee2 Mon Sep 17 00:00:00 2001
From: Zijun Hu
Date: Fri, 21 Feb 2025 05:02:20 -0800
Subject: mfd: db8500-prcmu: Remove needless return in three void APIs

Remove the needless 'return' in the following void APIs:

 prcmu_early_init()
 prcmu_system_reset()
 prcmu_modem_reset()

Both the APIs and the callees involved are void functions.

Signed-off-by: Zijun Hu
Reviewed-by: Linus Walleij
Link: https://lore.kernel.org/r/20250221-rmv_return-v1-15-cc8dff275827@quicinc.com
Signed-off-by: Lee Jones
---
 include/linux/mfd/dbx500-prcmu.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/mfd/dbx500-prcmu.h b/include/linux/mfd/dbx500-prcmu.h
index dd0fc891b228..98567623c9df 100644
--- a/include/linux/mfd/dbx500-prcmu.h
+++ b/include/linux/mfd/dbx500-prcmu.h
@@ -215,7 +215,7 @@ struct prcmu_fw_version {
 static inline void prcmu_early_init(void)
 {
-	return db8500_prcmu_early_init();
+	db8500_prcmu_early_init();
 }
 
 static inline int prcmu_set_power_state(u8 state, bool keep_ulp_clk,
@@ -302,7 +302,7 @@ static inline int prcmu_request_ape_opp_100_voltage(bool enable)
 static inline void prcmu_system_reset(u16 reset_code)
 {
-	return db8500_prcmu_system_reset(reset_code);
+	db8500_prcmu_system_reset(reset_code);
 }
 
 static inline u16 prcmu_get_reset_code(void)
@@ -314,7 +314,7 @@ int prcmu_ac_wake_req(void);
 void prcmu_ac_sleep_req(void);
 static inline void prcmu_modem_reset(void)
 {
-	return db8500_prcmu_modem_reset();
+	db8500_prcmu_modem_reset();
 }
 
 static inline bool prcmu_is_ac_wake_requested(void)
--
cgit v1.2.3

From 0d1217ab7fc0fec40dcebf57fdeff668ce495bda Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert"
Date: Thu, 6 Mar 2025 01:10:27 +0000
Subject: mfd: ezx-pcap: Remove unused pcap_adc_sync

pcap_adc_sync() was added in 2009 by commit 13a09f93d2bf ("mfd: add PCAP driver") but has remained unused; the async version is still used.

Remove it.

Signed-off-by: "Dr. David Alan Gilbert"
David Alan Gilbert" Link: https://lore.kernel.org/r/20250306011027.257021-1-linux@treblig.org Signed-off-by: Lee Jones --- include/linux/mfd/ezx-pcap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/ezx-pcap.h b/include/linux/mfd/ezx-pcap.h index ffde195e12b7..ea51b1cdca5a 100644 --- a/include/linux/mfd/ezx-pcap.h +++ b/include/linux/mfd/ezx-pcap.h @@ -31,7 +31,6 @@ int ezx_pcap_set_bits(struct pcap_chip *, u8, u32, u32); int pcap_to_irq(struct pcap_chip *, int); int irq_to_pcap(struct pcap_chip *, int); int pcap_adc_async(struct pcap_chip *, u8, u32, u8[], void *, void *); -int pcap_adc_sync(struct pcap_chip *, u8, u32, u8[], u16[]); void pcap_set_ts_bits(struct pcap_chip *, u32); #define PCAP_SECOND_PORT 1 -- cgit v1.2.3 From dfc034a0494b8fb8ea881aeb41a0c4e2619ff1e4 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 11 Mar 2025 01:49:57 +0000 Subject: backlight: pcf50633-backlight: Remove unused driver The pcf50633 was used as part of the OpenMoko devices but the support for its main chip was recently removed in: commit 61b7f8920b17 ("ARM: s3c: remove all s3c24xx support") See https://lore.kernel.org/all/Z8z236h4B5A6Ki3D@gallifrey/ Remove it. Signed-off-by: "Dr. David Alan Gilbert" Link: https://lore.kernel.org/r/20250311014959.743322-8-linux@treblig.org Signed-off-by: Lee Jones --- include/linux/mfd/pcf50633/backlight.h | 42 ---------------------------------- include/linux/mfd/pcf50633/core.h | 3 --- 2 files changed, 45 deletions(-) delete mode 100644 include/linux/mfd/pcf50633/backlight.h (limited to 'include/linux') diff --git a/include/linux/mfd/pcf50633/backlight.h b/include/linux/mfd/pcf50633/backlight.h deleted file mode 100644 index fd4a4f8d6c13..000000000000 --- a/include/linux/mfd/pcf50633/backlight.h +++ /dev/null @@ -1,42 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2009-2010, Lars-Peter Clausen - * PCF50633 backlight device driver - */ - -#ifndef __LINUX_MFD_PCF50633_BACKLIGHT -#define __LINUX_MFD_PCF50633_BACKLIGHT - -/* -* @default_brightness: Backlight brightness is initialized to this value -* -* Brightness to be used after the driver has been probed. -* Valid range 0-63. -* -* @default_brightness_limit: The actual brightness is limited by this value -* -* Brightness limit to be used after the driver has been probed. This is useful -* when it is not known how much power is available for the backlight during -* probe. -* Valid range 0-63. Can be changed later with pcf50633_bl_set_brightness_limit. -* -* @ramp_time: Display ramp time when changing brightness -* -* When changing the backlights brightness the change is not instant, instead -* it fades smooth from one state to another. This value specifies how long -* the fade should take. The lower the value the higher the fade time. 
-* Valid range 0-255 -*/ -struct pcf50633_bl_platform_data { - unsigned int default_brightness; - unsigned int default_brightness_limit; - uint8_t ramp_time; -}; - - -struct pcf50633; - -int pcf50633_bl_set_brightness_limit(struct pcf50633 *pcf, unsigned int limit); - -#endif - diff --git a/include/linux/mfd/pcf50633/core.h b/include/linux/mfd/pcf50633/core.h index 539f27f8bd89..42d2b0e4884e 100644 --- a/include/linux/mfd/pcf50633/core.h +++ b/include/linux/mfd/pcf50633/core.h @@ -15,7 +15,6 @@ #include #include #include -#include struct pcf50633; struct regmap; @@ -42,8 +41,6 @@ struct pcf50633_platform_data { void (*force_shutdown)(struct pcf50633 *); u8 resumers[5]; - - struct pcf50633_bl_platform_data *backlight_data; }; struct pcf50633_irq { -- cgit v1.2.3 From 0d0e54953805af76f0022df39602f5668145f747 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 11 Mar 2025 01:49:51 +0000 Subject: mfd: pcf50633-adc: Remove unused driver The pcf50633 was used as part of the OpenMoko devices but the support for its main chip was recently removed in: commit 61b7f8920b17 ("ARM: s3c: remove all s3c24xx support") See https://lore.kernel.org/all/Z8z236h4B5A6Ki3D@gallifrey/ Remove it. Signed-off-by: "Dr. David Alan Gilbert" Link: https://lore.kernel.org/r/20250311014959.743322-2-linux@treblig.org Signed-off-by: Lee Jones --- include/linux/mfd/pcf50633/adc.h | 69 ---------------------------------------- 1 file changed, 69 deletions(-) delete mode 100644 include/linux/mfd/pcf50633/adc.h (limited to 'include/linux') diff --git a/include/linux/mfd/pcf50633/adc.h b/include/linux/mfd/pcf50633/adc.h deleted file mode 100644 index 6a81896d4889..000000000000 --- a/include/linux/mfd/pcf50633/adc.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * adc.h -- Driver for NXP PCF50633 ADC - * - * (C) 2006-2008 by Openmoko, Inc. - * All rights reserved. 
- */ - -#ifndef __LINUX_MFD_PCF50633_ADC_H -#define __LINUX_MFD_PCF50633_ADC_H - -#include -#include - -/* ADC Registers */ -#define PCF50633_REG_ADCC3 0x52 -#define PCF50633_REG_ADCC2 0x53 -#define PCF50633_REG_ADCC1 0x54 -#define PCF50633_REG_ADCS1 0x55 -#define PCF50633_REG_ADCS2 0x56 -#define PCF50633_REG_ADCS3 0x57 - -#define PCF50633_ADCC1_ADCSTART 0x01 -#define PCF50633_ADCC1_RES_8BIT 0x02 -#define PCF50633_ADCC1_RES_10BIT 0x00 -#define PCF50633_ADCC1_AVERAGE_NO 0x00 -#define PCF50633_ADCC1_AVERAGE_4 0x04 -#define PCF50633_ADCC1_AVERAGE_8 0x08 -#define PCF50633_ADCC1_AVERAGE_16 0x0c -#define PCF50633_ADCC1_MUX_BATSNS_RES 0x00 -#define PCF50633_ADCC1_MUX_BATSNS_SUBTR 0x10 -#define PCF50633_ADCC1_MUX_ADCIN2_RES 0x20 -#define PCF50633_ADCC1_MUX_ADCIN2_SUBTR 0x30 -#define PCF50633_ADCC1_MUX_BATTEMP 0x60 -#define PCF50633_ADCC1_MUX_ADCIN1 0x70 -#define PCF50633_ADCC1_AVERAGE_MASK 0x0c -#define PCF50633_ADCC1_ADCMUX_MASK 0xf0 - -#define PCF50633_ADCC2_RATIO_NONE 0x00 -#define PCF50633_ADCC2_RATIO_BATTEMP 0x01 -#define PCF50633_ADCC2_RATIO_ADCIN1 0x02 -#define PCF50633_ADCC2_RATIO_BOTH 0x03 -#define PCF50633_ADCC2_RATIOSETTL_100US 0x04 - -#define PCF50633_ADCC3_ACCSW_EN 0x01 -#define PCF50633_ADCC3_NTCSW_EN 0x04 -#define PCF50633_ADCC3_RES_DIV_TWO 0x10 -#define PCF50633_ADCC3_RES_DIV_THREE 0x00 - -#define PCF50633_ADCS3_REF_NTCSW 0x00 -#define PCF50633_ADCS3_REF_ACCSW 0x10 -#define PCF50633_ADCS3_REF_2V0 0x20 -#define PCF50633_ADCS3_REF_VISA 0x30 -#define PCF50633_ADCS3_REF_2V0_2 0x70 -#define PCF50633_ADCS3_ADCRDY 0x80 - -#define PCF50633_ADCS3_ADCDAT1L_MASK 0x03 -#define PCF50633_ADCS3_ADCDAT2L_MASK 0x0c -#define PCF50633_ADCS3_ADCDAT2L_SHIFT 2 -#define PCF50633_ASCS3_REF_MASK 0x70 - -extern int -pcf50633_adc_async_read(struct pcf50633 *pcf, int mux, int avg, - void (*callback)(struct pcf50633 *, void *, int), - void *callback_param); -extern int -pcf50633_adc_sync_read(struct pcf50633 *pcf, int mux, int avg); - -#endif /* __LINUX_PCF50633_ADC_H */ -- cgit v1.2.3 From 8559602247d0d054451c7a755942588d2c0de85d Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 11 Mar 2025 01:49:53 +0000 Subject: mfd: pcF50633-gpio: Remove unused driver The pcf50633 was used as part of the OpenMoko devices but the support for its main chip was recently removed in: commit 61b7f8920b17 ("ARM: s3c: remove all s3c24xx support") See https://lore.kernel.org/all/Z8z236h4B5A6Ki3D@gallifrey/ Remove it. Signed-off-by: "Dr. David Alan Gilbert" Acked-by: Linus Walleij Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250311014959.743322-4-linux@treblig.org Signed-off-by: Lee Jones --- include/linux/mfd/pcf50633/gpio.h | 48 --------------------------------------- 1 file changed, 48 deletions(-) delete mode 100644 include/linux/mfd/pcf50633/gpio.h (limited to 'include/linux') diff --git a/include/linux/mfd/pcf50633/gpio.h b/include/linux/mfd/pcf50633/gpio.h deleted file mode 100644 index f589e35795f1..000000000000 --- a/include/linux/mfd/pcf50633/gpio.h +++ /dev/null @@ -1,48 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * gpio.h -- GPIO driver for NXP PCF50633 - * - * (C) 2006-2008 by Openmoko, Inc. - * All rights reserved. 
- */ - -#ifndef __LINUX_MFD_PCF50633_GPIO_H -#define __LINUX_MFD_PCF50633_GPIO_H - -#include - -#define PCF50633_GPIO1 1 -#define PCF50633_GPIO2 2 -#define PCF50633_GPIO3 3 -#define PCF50633_GPO 4 - -#define PCF50633_REG_GPIO1CFG 0x14 -#define PCF50633_REG_GPIO2CFG 0x15 -#define PCF50633_REG_GPIO3CFG 0x16 -#define PCF50633_REG_GPOCFG 0x17 - -#define PCF50633_GPOCFG_GPOSEL_MASK 0x07 - -enum pcf50633_reg_gpocfg { - PCF50633_GPOCFG_GPOSEL_0 = 0x00, - PCF50633_GPOCFG_GPOSEL_LED_NFET = 0x01, - PCF50633_GPOCFG_GPOSEL_SYSxOK = 0x02, - PCF50633_GPOCFG_GPOSEL_CLK32K = 0x03, - PCF50633_GPOCFG_GPOSEL_ADAPUSB = 0x04, - PCF50633_GPOCFG_GPOSEL_USBxOK = 0x05, - PCF50633_GPOCFG_GPOSEL_ACTPH4 = 0x06, - PCF50633_GPOCFG_GPOSEL_1 = 0x07, - PCF50633_GPOCFG_GPOSEL_INVERSE = 0x08, -}; - -int pcf50633_gpio_set(struct pcf50633 *pcf, int gpio, u8 val); -u8 pcf50633_gpio_get(struct pcf50633 *pcf, int gpio); - -int pcf50633_gpio_invert_set(struct pcf50633 *, int gpio, int invert); -int pcf50633_gpio_invert_get(struct pcf50633 *pcf, int gpio); - -int pcf50633_gpio_power_supply_set(struct pcf50633 *, - int gpio, int regulator, int on); -#endif /* __LINUX_MFD_PCF50633_GPIO_H */ - - -- cgit v1.2.3 From 44356090d59efd8db152e9eecb8e7f843be319f0 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 11 Mar 2025 01:49:59 +0000 Subject: mfd: pcf50633: Remove remaining PCF50633 support Remove the remaining parts of the 50633, the core, headers and glue. The pcf50633 was used as part of the OpenMoko devices but the support for its main chip was recently removed in: commit 61b7f8920b17 ("ARM: s3c: remove all s3c24xx support") See https://lore.kernel.org/all/Z8z236h4B5A6Ki3D@gallifrey/ Remove it. Signed-off-by: "Dr. David Alan Gilbert" Link: https://lore.kernel.org/r/20250311014959.743322-10-linux@treblig.org Signed-off-by: Lee Jones --- include/linux/mfd/pcf50633/mbc.h | 130 -------------------------------------- include/linux/mfd/pcf50633/pmic.h | 68 -------------------- 2 files changed, 198 deletions(-) delete mode 100644 include/linux/mfd/pcf50633/mbc.h delete mode 100644 include/linux/mfd/pcf50633/pmic.h (limited to 'include/linux') diff --git a/include/linux/mfd/pcf50633/mbc.h b/include/linux/mfd/pcf50633/mbc.h deleted file mode 100644 index fa5cb9256d99..000000000000 --- a/include/linux/mfd/pcf50633/mbc.h +++ /dev/null @@ -1,130 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * mbc.h -- Driver for NXP PCF50633 Main Battery Charger - * - * (C) 2006-2008 by Openmoko, Inc. - * All rights reserved. - */ - -#ifndef __LINUX_MFD_PCF50633_MBC_H -#define __LINUX_MFD_PCF50633_MBC_H - -#include -#include - -#define PCF50633_REG_MBCC1 0x43 -#define PCF50633_REG_MBCC2 0x44 -#define PCF50633_REG_MBCC3 0x45 -#define PCF50633_REG_MBCC4 0x46 -#define PCF50633_REG_MBCC5 0x47 -#define PCF50633_REG_MBCC6 0x48 -#define PCF50633_REG_MBCC7 0x49 -#define PCF50633_REG_MBCC8 0x4a -#define PCF50633_REG_MBCS1 0x4b -#define PCF50633_REG_MBCS2 0x4c -#define PCF50633_REG_MBCS3 0x4d - -enum pcf50633_reg_mbcc1 { - PCF50633_MBCC1_CHGENA = 0x01, /* Charger enable */ - PCF50633_MBCC1_AUTOSTOP = 0x02, - PCF50633_MBCC1_AUTORES = 0x04, /* automatic resume */ - PCF50633_MBCC1_RESUME = 0x08, /* explicit resume cmd */ - PCF50633_MBCC1_RESTART = 0x10, /* restart charging */ - PCF50633_MBCC1_PREWDTIME_60M = 0x20, /* max. 
-	PCF50633_MBCC1_WDTIME_1H = 0x00,
-	PCF50633_MBCC1_WDTIME_2H = 0x40,
-	PCF50633_MBCC1_WDTIME_4H = 0x80,
-	PCF50633_MBCC1_WDTIME_6H = 0xc0,
-};
-#define PCF50633_MBCC1_WDTIME_MASK 0xc0
-
-enum pcf50633_reg_mbcc2 {
-	PCF50633_MBCC2_VBATCOND_2V7 = 0x00,
-	PCF50633_MBCC2_VBATCOND_2V85 = 0x01,
-	PCF50633_MBCC2_VBATCOND_3V0 = 0x02,
-	PCF50633_MBCC2_VBATCOND_3V15 = 0x03,
-	PCF50633_MBCC2_VMAX_4V = 0x00,
-	PCF50633_MBCC2_VMAX_4V20 = 0x28,
-	PCF50633_MBCC2_VRESDEBTIME_64S = 0x80, /* debounce time (32/64sec) */
-};
-
-enum pcf50633_reg_mbcc7 {
-	PCF50633_MBCC7_USB_100mA = 0x00,
-	PCF50633_MBCC7_USB_500mA = 0x01,
-	PCF50633_MBCC7_USB_1000mA = 0x02,
-	PCF50633_MBCC7_USB_SUSPEND = 0x03,
-	PCF50633_MBCC7_BATTEMP_EN = 0x04,
-	PCF50633_MBCC7_BATSYSIMAX_1A6 = 0x00,
-	PCF50633_MBCC7_BATSYSIMAX_1A8 = 0x40,
-	PCF50633_MBCC7_BATSYSIMAX_2A0 = 0x80,
-	PCF50633_MBCC7_BATSYSIMAX_2A2 = 0xc0,
-};
-#define PCF50633_MBCC7_USB_MASK 0x03
-
-enum pcf50633_reg_mbcc8 {
-	PCF50633_MBCC8_USBENASUS = 0x10,
-};
-
-enum pcf50633_reg_mbcs1 {
-	PCF50633_MBCS1_USBPRES = 0x01,
-	PCF50633_MBCS1_USBOK = 0x02,
-	PCF50633_MBCS1_ADAPTPRES = 0x04,
-	PCF50633_MBCS1_ADAPTOK = 0x08,
-	PCF50633_MBCS1_TBAT_OK = 0x00,
-	PCF50633_MBCS1_TBAT_ABOVE = 0x10,
-	PCF50633_MBCS1_TBAT_BELOW = 0x20,
-	PCF50633_MBCS1_TBAT_UNDEF = 0x30,
-	PCF50633_MBCS1_PREWDTEXP = 0x40,
-	PCF50633_MBCS1_WDTEXP = 0x80,
-};
-
-enum pcf50633_reg_mbcs2_mbcmod {
-	PCF50633_MBCS2_MBC_PLAY = 0x00,
-	PCF50633_MBCS2_MBC_USB_PRE = 0x01,
-	PCF50633_MBCS2_MBC_USB_PRE_WAIT = 0x02,
-	PCF50633_MBCS2_MBC_USB_FAST = 0x03,
-	PCF50633_MBCS2_MBC_USB_FAST_WAIT = 0x04,
-	PCF50633_MBCS2_MBC_USB_SUSPEND = 0x05,
-	PCF50633_MBCS2_MBC_ADP_PRE = 0x06,
-	PCF50633_MBCS2_MBC_ADP_PRE_WAIT = 0x07,
-	PCF50633_MBCS2_MBC_ADP_FAST = 0x08,
-	PCF50633_MBCS2_MBC_ADP_FAST_WAIT = 0x09,
-	PCF50633_MBCS2_MBC_BAT_FULL = 0x0a,
-	PCF50633_MBCS2_MBC_HALT = 0x0b,
-};
-#define PCF50633_MBCS2_MBC_MASK 0x0f
-enum pcf50633_reg_mbcs2_chgstat {
-	PCF50633_MBCS2_CHGS_NONE = 0x00,
-	PCF50633_MBCS2_CHGS_ADAPTER = 0x10,
-	PCF50633_MBCS2_CHGS_USB = 0x20,
-	PCF50633_MBCS2_CHGS_BOTH = 0x30,
-};
-#define PCF50633_MBCS2_RESSTAT_AUTO 0x40
-
-enum pcf50633_reg_mbcs3 {
-	PCF50633_MBCS3_USBLIM_PLAY = 0x01,
-	PCF50633_MBCS3_USBLIM_CGH = 0x02,
-	PCF50633_MBCS3_TLIM_PLAY = 0x04,
-	PCF50633_MBCS3_TLIM_CHG = 0x08,
-	PCF50633_MBCS3_ILIM = 0x10, /* 1: Ibat > Icutoff */
-	PCF50633_MBCS3_VLIM = 0x20, /* 1: Vbat == Vmax */
-	PCF50633_MBCS3_VBATSTAT = 0x40, /* 1: Vbat > Vbatcond */
-	PCF50633_MBCS3_VRES = 0x80, /* 1: Vbat > Vth(RES) */
-};
-
-#define PCF50633_MBCC2_VBATCOND_MASK 0x03
-#define PCF50633_MBCC2_VMAX_MASK 0x3c
-
-/* Charger status */
-#define PCF50633_MBC_USB_ONLINE 0x01
-#define PCF50633_MBC_USB_ACTIVE 0x02
-#define PCF50633_MBC_ADAPTER_ONLINE 0x04
-#define PCF50633_MBC_ADAPTER_ACTIVE 0x08
-
-int pcf50633_mbc_usb_curlim_set(struct pcf50633 *pcf, int ma);
-
-int pcf50633_mbc_get_status(struct pcf50633 *);
-int pcf50633_mbc_get_usb_online_status(struct pcf50633 *);
-
-#endif
-
diff --git a/include/linux/mfd/pcf50633/pmic.h b/include/linux/mfd/pcf50633/pmic.h
deleted file mode 100644
index eac0c3d8e984..000000000000
--- a/include/linux/mfd/pcf50633/pmic.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __LINUX_MFD_PCF50633_PMIC_H
-#define __LINUX_MFD_PCF50633_PMIC_H
-
-#include
-#include
-
-#define PCF50633_REG_AUTOOUT 0x1a
-#define PCF50633_REG_AUTOENA 0x1b
-#define PCF50633_REG_AUTOCTL 0x1c
-#define PCF50633_REG_AUTOMXC 0x1d
-#define PCF50633_REG_DOWN1OUT 0x1e
-#define PCF50633_REG_DOWN1ENA 0x1f
-#define PCF50633_REG_DOWN1CTL 0x20
-#define PCF50633_REG_DOWN1MXC 0x21
-#define PCF50633_REG_DOWN2OUT 0x22
-#define PCF50633_REG_DOWN2ENA 0x23
-#define PCF50633_REG_DOWN2CTL 0x24
-#define PCF50633_REG_DOWN2MXC 0x25
-#define PCF50633_REG_MEMLDOOUT 0x26
-#define PCF50633_REG_MEMLDOENA 0x27
-#define PCF50633_REG_LDO1OUT 0x2d
-#define PCF50633_REG_LDO1ENA 0x2e
-#define PCF50633_REG_LDO2OUT 0x2f
-#define PCF50633_REG_LDO2ENA 0x30
-#define PCF50633_REG_LDO3OUT 0x31
-#define PCF50633_REG_LDO3ENA 0x32
-#define PCF50633_REG_LDO4OUT 0x33
-#define PCF50633_REG_LDO4ENA 0x34
-#define PCF50633_REG_LDO5OUT 0x35
-#define PCF50633_REG_LDO5ENA 0x36
-#define PCF50633_REG_LDO6OUT 0x37
-#define PCF50633_REG_LDO6ENA 0x38
-#define PCF50633_REG_HCLDOOUT 0x39
-#define PCF50633_REG_HCLDOENA 0x3a
-#define PCF50633_REG_HCLDOOVL 0x40
-
-enum pcf50633_regulator_enable {
-	PCF50633_REGULATOR_ON = 0x01,
-	PCF50633_REGULATOR_ON_GPIO1 = 0x02,
-	PCF50633_REGULATOR_ON_GPIO2 = 0x04,
-	PCF50633_REGULATOR_ON_GPIO3 = 0x08,
-};
-#define PCF50633_REGULATOR_ON_MASK 0x0f
-
-enum pcf50633_regulator_phase {
-	PCF50633_REGULATOR_ACTPH1 = 0x00,
-	PCF50633_REGULATOR_ACTPH2 = 0x10,
-	PCF50633_REGULATOR_ACTPH3 = 0x20,
-	PCF50633_REGULATOR_ACTPH4 = 0x30,
-};
-#define PCF50633_REGULATOR_ACTPH_MASK 0x30
-
-enum pcf50633_regulator_id {
-	PCF50633_REGULATOR_AUTO,
-	PCF50633_REGULATOR_DOWN1,
-	PCF50633_REGULATOR_DOWN2,
-	PCF50633_REGULATOR_LDO1,
-	PCF50633_REGULATOR_LDO2,
-	PCF50633_REGULATOR_LDO3,
-	PCF50633_REGULATOR_LDO4,
-	PCF50633_REGULATOR_LDO5,
-	PCF50633_REGULATOR_LDO6,
-	PCF50633_REGULATOR_HCLDO,
-	PCF50633_REGULATOR_MEMLDO,
-};
-#endif
-
--
cgit v1.2.3

From 4b455f59945aab5610828a1320b045c82cbe8852 Mon Sep 17 00:00:00 2001
From: Yicong Yang
Date: Tue, 11 Mar 2025 15:51:40 +0800
Subject: cpu/SMT: Provide a default topology_is_primary_thread()

Currently, if architectures want to support HOTPLUG_SMT they need to provide a topology_is_primary_thread() telling the framework which thread in the SMT cannot be taken offline. However, arm64 has no restriction on which thread in the SMT can be offlined, so the simplest choice is to treat the first thread as the "primary" thread. Make this the default implementation in the framework and let architectures like x86 that have a special primary thread override this function (which they've already done).

There's no need to provide a stub function if !CONFIG_SMP or !CONFIG_HOTPLUG_SMT: in those cases the CPU being tested is already the first CPU in the SMT, so it's always the primary thread.

Reviewed-by: Jonathan Cameron
Reviewed-by: Pierre Gondois
Reviewed-by: Dietmar Eggemann
Signed-off-by: Yicong Yang
Reviewed-by: Sudeep Holla
Link: https://lore.kernel.org/r/20250311075143.61078-2-yangyicong@huawei.com
Signed-off-by: Catalin Marinas
---
 include/linux/topology.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 52f5850730b3..f8bb02c5e8a4 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -240,6 +240,29 @@ static inline const struct cpumask *cpu_smt_mask(int cpu)
 }
 #endif
 
+#ifndef topology_is_primary_thread
+
+static inline bool topology_is_primary_thread(unsigned int cpu)
+{
+	/*
+	 * When disabling SMT, the primary thread of the SMT will remain
+	 * enabled/active. Architectures that have a special primary thread
+	 * (e.g. x86) need to override this function. Otherwise the first
+	 * thread in the SMT can be made the primary thread.
+	 *
+	 * The sibling cpumask of an offline CPU always contains the CPU
+	 * itself on architectures using the implementation of
+	 * CONFIG_GENERIC_ARCH_TOPOLOGY for building their topology.
+	 * Other architectures not using CONFIG_GENERIC_ARCH_TOPOLOGY for
+	 * building their topology have to check whether to use this default
+	 * implementation or to override it.
+	 */
+	return cpu == cpumask_first(topology_sibling_cpumask(cpu));
+}
+#define topology_is_primary_thread topology_is_primary_thread
+
+#endif
+
 static inline const struct cpumask *cpu_cpu_mask(int cpu)
 {
 	return cpumask_of_node(cpu_to_node(cpu));
--
cgit v1.2.3
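For comparison, an architecture that does have a constrained primary thread overrides the default instead of using it. x86's override is conceptually along these lines (a simplified sketch, not the verbatim x86 code):

	static inline bool topology_is_primary_thread(unsigned int cpu)
	{
		/* Only threads in the pre-computed primary mask stay online when SMT is disabled. */
		return cpumask_test_cpu(cpu, cpu_primary_thread_mask);
	}
	#define topology_is_primary_thread topology_is_primary_thread

Because the #define is visible before linux/topology.h is parsed, the generic fallback above compiles out on such architectures.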
From 87886b32d669abc11c7be95ef44099215e4f5788 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Wed, 12 Feb 2025 11:36:18 +0100
Subject: lockdep: Don't disable interrupts on RT in disable_irq_nosync_lockdep.*()

disable_irq_nosync_lockdep() disables interrupts with lockdep enabled to avoid false-positive reports by lockdep that a certain lock has not been acquired with disabled interrupts. Users of these macros expect that a lock can be acquired without disabling interrupts, because the IRQ line triggering the interrupt is disabled.

This triggers a warning on PREEMPT_RT because, after disable_irq_nosync_lockdep.*(), the following spinlock_t is now acquired with disabled interrupts. On PREEMPT_RT there is no difference between spin_lock() and spin_lock_irq(), so avoiding disabling interrupts in this case works for the two remaining callers as of today.

Don't disable interrupts on PREEMPT_RT in disable_irq_nosync_lockdep.*().

Closes: https://lore.kernel.org/760e34f9-6034-40e0-82a5-ee9becd24438@roeck-us.net
Fixes: e8106b941ceab ("[PATCH] lockdep: core, add enable/disable_irq_irqsave/irqrestore() APIs")
Reported-by: Guenter Roeck
Suggested-by: "Steven Rostedt (Google)"
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Guenter Roeck
Link: https://lore.kernel.org/r/20250212103619.2560503-2-bigeasy@linutronix.de
---
 include/linux/interrupt.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 8cd9327e4e78..a1b1be9bf73b 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -448,7 +448,7 @@ irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
 static inline void disable_irq_nosync_lockdep(unsigned int irq)
 {
 	disable_irq_nosync(irq);
-#ifdef CONFIG_LOCKDEP
+#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT)
 	local_irq_disable();
 #endif
 }
@@ -456,7 +456,7 @@ static inline void disable_irq_nosync_lockdep(unsigned int irq)
 static inline void disable_irq_nosync_lockdep_irqsave(unsigned int irq, unsigned long *flags)
 {
 	disable_irq_nosync(irq);
-#ifdef CONFIG_LOCKDEP
+#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT)
 	local_irq_save(*flags);
 #endif
 }
@@ -471,7 +471,7 @@ static inline void disable_irq_lockdep(unsigned int irq)
 
 static inline void enable_irq_lockdep(unsigned int irq)
 {
-#ifdef CONFIG_LOCKDEP
+#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT)
 	local_irq_enable();
 #endif
 	enable_irq(irq);
@@ -479,7 +479,7 @@ static inline void enable_irq_lockdep(unsigned int irq)
 
 static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long *flags)
 {
-#ifdef CONFIG_LOCKDEP
+#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT)
 	local_irq_restore(*flags);
 #endif
 	enable_irq(irq);
--
cgit v1.2.3
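The pattern these helpers support looks roughly like this (an illustrative sketch with a made-up device, not taken from an in-tree driver):

	struct mydev {
		int irq;
		spinlock_t lock;
	};

	static void mydev_sync_state(struct mydev *dev)
	{
		unsigned long flags;

		/* Mask the IRQ line itself; lockdep is additionally told IRQs are off. */
		disable_irq_nosync_lockdep_irqsave(dev->irq, &flags);
		/* Safe without _irqsave: our own handler cannot run concurrently. */
		spin_lock(&dev->lock);
		/* ... update state shared with the interrupt handler ... */
		spin_unlock(&dev->lock);
		enable_irq_lockdep_irqrestore(dev->irq, &flags);
	}

On PREEMPT_RT the spin_lock() above is a sleeping lock, which is exactly why these helpers must not actually disable interrupts there.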
From 35e6b537af85d97e0aafd8f2829dfa884a22df20 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Wed, 12 Feb 2025 11:36:19 +0100
Subject: lockdep: Remove disable_irq_lockdep()

disable_irq_lockdep() has no users; the last one was probably removed in 0b7c874348ea1 ("forcedeth: fix unilateral interrupt disabling in netpoll path").

Remove disable_irq_lockdep().

Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Guenter Roeck
Link: https://lore.kernel.org/r/20250212103619.2560503-3-bigeasy@linutronix.de
---
 include/linux/interrupt.h | 8 --------
 1 file changed, 8 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a1b1be9bf73b..c782a74d2a30 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -461,14 +461,6 @@ static inline void disable_irq_nosync_lockdep_irqsave(unsigned int irq, unsigned
 #endif
 }
 
-static inline void disable_irq_lockdep(unsigned int irq)
-{
-	disable_irq(irq);
-#ifdef CONFIG_LOCKDEP
-	local_irq_disable();
-#endif
-}
-
 static inline void enable_irq_lockdep(unsigned int irq)
 {
 #if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT)
--
cgit v1.2.3

From 590f5d6752f7d951d3549b259c1436940131703b Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)"
Date: Wed, 12 Mar 2025 16:29:46 -0500
Subject: of: Move of_prop_val_eq() next to the single user

There's only a single user of of_prop_val_eq(), so move it to overlay.c. This removes one case of exposing struct property outside of the DT code.

Signed-off-by: "Rob Herring (Arm)"
Link: https://lore.kernel.org/r/20250312212947.1067337-1-robh@kernel.org
Signed-off-by: Rob Herring (Arm)
---
 include/linux/of.h | 6 ------
 1 file changed, 6 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index 86bf8f073111..e95a02db321a 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -908,12 +908,6 @@ static inline const void *of_device_get_match_data(const struct device *dev)
 #define of_node_cmp(s1, s2) strcasecmp((s1), (s2))
 #endif
 
-static inline int of_prop_val_eq(const struct property *p1, const struct property *p2)
-{
-	return p1->length == p2->length &&
-	       !memcmp(p1->value, p2->value, (size_t)p1->length);
-}
-
 #define for_each_property_of_node(dn, pp) \
 	for (pp = dn->properties; pp != NULL; pp = pp->next)
--
cgit v1.2.3

From 20238d49448cdb406da2b9bd3e50f892b26da318 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert"
Date: Sun, 29 Sep 2024 14:21:48 +0100
Subject: async_xor: Remove unused 'async_xor_val'

async_xor_val has been unused since commit a7c224a820c3 ("md/raid5: convert to new xor compution interface").

Remove it.

Signed-off-by: Dr. David Alan Gilbert
Signed-off-by: Herbert Xu
---
 include/linux/async_tx.h | 5 -----
 1 file changed, 5 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index 5cc73d7e5b52..1ca9f9e05f4f 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -167,11 +167,6 @@ async_xor_offs(struct page *dest, unsigned int offset,
 	struct page **src_list, unsigned int *src_offset,
 	int src_cnt, size_t len, struct async_submit_ctl *submit);
 
-struct dma_async_tx_descriptor *
-async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
-	int src_cnt, size_t len, enum sum_check_flags *result,
-	struct async_submit_ctl *submit);
-
 struct dma_async_tx_descriptor *
 async_xor_val_offs(struct page *dest, unsigned int offset,
 	struct page **src_list, unsigned int *src_offset,
--
cgit v1.2.3

From f90b474a35744b5d43009e4fab232e74a3024cae Mon Sep 17 00:00:00 2001
From: Vlastimil Babka
Date: Mon, 10 Mar 2025 13:40:17 +0100
Subject: mm: Fix the flipped condition in gfpflags_allow_spinning()

The function gfpflags_allow_spinning() has a bug that makes it return the opposite of the intended result. This could contribute to deadlocks as usage proliferates; for now it was noticed as a performance regression due to try_charge_memcg() not refilling the memcg stock when it could. Fix the flipped condition.

Fixes: 97769a53f117 ("mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation")
Reported-by: kernel test robot
Acked-by: Shakeel Butt
Signed-off-by: Vlastimil Babka
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20250310124017.187-1-alexei.starovoitov@gmail.com
Closes: https://lore.kernel.org/oe-lkp/202503101254.cfd454df-lkp@intel.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/gfp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index ceb226c2e25c..c9fa6309c903 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -55,7 +55,7 @@ static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
 	 * regular page allocator doesn't fully support this
 	 * allocation mode.
 	 */
-	return !(gfp_flags & __GFP_RECLAIM);
+	return !!(gfp_flags & __GFP_RECLAIM);
 }
 
 #ifdef CONFIG_HIGHMEM
--
cgit v1.2.3
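With the corrected condition, spinning is permitted exactly when the mask carries reclaim bits; for illustration (relying only on the helper shown above):

	/* GFP_KERNEL contains __GFP_RECLAIM bits: spinning/sleeping is fine. */
	bool can_spin = gfpflags_allow_spinning(GFP_KERNEL);	/* true */
	/* A bare mask, as used by opportunistic allocations, does not. */
	bool no_spin = gfpflags_allow_spinning(0);		/* false */

Callers such as try_charge_memcg() consult the helper to decide whether taking the stock spinlock (and thus refilling the stock) is safe.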
David Alan Gilbert Signed-off-by: Herbert Xu --- include/linux/async_tx.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 5cc73d7e5b52..1ca9f9e05f4f 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -167,11 +167,6 @@ async_xor_offs(struct page *dest, unsigned int offset, struct page **src_list, unsigned int *src_offset, int src_cnt, size_t len, struct async_submit_ctl *submit); -struct dma_async_tx_descriptor * -async_xor_val(struct page *dest, struct page **src_list, unsigned int offset, - int src_cnt, size_t len, enum sum_check_flags *result, - struct async_submit_ctl *submit); - struct dma_async_tx_descriptor * async_xor_val_offs(struct page *dest, unsigned int offset, struct page **src_list, unsigned int *src_offset, -- cgit v1.2.3 From f90b474a35744b5d43009e4fab232e74a3024cae Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 10 Mar 2025 13:40:17 +0100 Subject: mm: Fix the flipped condition in gfpflags_allow_spinning() The function gfpflags_allow_spinning() has a bug that makes it return the opposite result than intended. This could contribute to deadlocks as usage profilerates, for now it was noticed as a performance regression due to try_charge_memcg() not refilling memcg stock when it could. Fix the flipped condition. Fixes: 97769a53f117 ("mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation") Reported-by: kernel test robot Acked-by: Shakeel Butt Signed-off-by: Vlastimil Babka Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250310124017.187-1-alexei.starovoitov@gmail.com Closes: https://lore.kernel.org/oe-lkp/202503101254.cfd454df-lkp@intel.com Signed-off-by: Alexei Starovoitov --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index ceb226c2e25c..c9fa6309c903 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -55,7 +55,7 @@ static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags) * regular page allocator doesn't fully support this * allocation mode. */ - return !(gfp_flags & __GFP_RECLAIM); + return !!(gfp_flags & __GFP_RECLAIM); } #ifdef CONFIG_HIGHMEM -- cgit v1.2.3 From 4b82b181a26cff8bf7adc3a85a88d121d92edeaf Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 24 Feb 2025 15:01:16 -0800 Subject: bpf: Allow pre-ordering for bpf cgroup progs Currently for bpf progs in a cgroup hierarchy, the effective prog array is computed from bottom cgroup to upper cgroups (post-ordering). For example, the following cgroup hierarchy root cgroup: p1, p2 subcgroup: p3, p4 have BPF_F_ALLOW_MULTI for both cgroup levels. The effective cgroup array ordering looks like p3 p4 p1 p2 and at run time, progs will execute based on that order. But in some cases, it is desirable to have root prog executes earlier than children progs (pre-ordering). For example, - prog p1 intends to collect original pkt dest addresses. - prog p3 will modify original pkt dest addresses to a proxy address for security reason. The end result is that prog p1 gets proxy address which is not what it wants. Putting p1 to every child cgroup is not desirable either as it will duplicate itself in many child cgroups. And this is exactly a use case we are encountering in Meta. To fix this issue, let us introduce a flag BPF_F_PREORDER. 
If the flag is specified at attachment time, the prog has higher priority, and the ordering with that flag will be from top to bottom (pre-ordering). For example, in the above hierarchy

    root cgroup: p1, p2
    subcgroup: p3, p4

let us say p2 and p4 are marked with BPF_F_PREORDER. The final effective array ordering will be

    p2 p4 p3 p1

Suggested-by: Andrii Nakryiko
Acked-by: Andrii Nakryiko
Signed-off-by: Yonghong Song
Link: https://lore.kernel.org/r/20250224230116.283071-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf-cgroup.h | 1 +
 1 file changed, 1 insertion(+)
(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 7fc69083e745..9de7adb68294 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -111,6 +111,7 @@ struct bpf_prog_list {
 	struct bpf_prog *prog;
 	struct bpf_cgroup_link *link;
 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
+	u32 flags;
 };
 
 int cgroup_bpf_inherit(struct cgroup *cgrp);
--
cgit v1.2.3

From e2d8f560d178970c345a6271330012b0977d9093 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Sat, 1 Mar 2025 07:18:44 -0800
Subject: bpf: Summarize sleepable global subprogs

The verifier currently does not permit global subprog calls when a lock is held, preemption is disabled, or when IRQs are disabled. This is because we don't know whether the global subprog calls sleepable functions or not.

In the case of locks, there's an additional reason: functions called by the global subprog may hold additional locks etc. The verifier won't know, while verifying the global subprog, whether it was called in a context where a spin lock is already held by the program.

Perform summarization of the sleepable nature of a global subprog, just like changes_pkt_data, and then allow calls to non-sleepable global subprogs from atomic context.

While making this change, I noticed that RCU read sections had no protection against sleepable global subprog calls; include them in the checks and fix this while we're at it.

Care needs to be taken to not allow global subprog calls when a regular bpf_spin_lock is held. When resilient spin locks are held, we want to potentially have this check relaxed, but not for now.

Also make sure extensions freplacing global functions cannot do so when the target is non-sleepable but the extension is. The other combination is ok.

Tests are included in the next patch to handle all special conditions.
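As an illustration, a hand-written sketch in the style of the selftests (not one of the included tests; bpf_rcu_read_lock/unlock are the kfuncs named above):

	extern void bpf_rcu_read_lock(void) __ksym;
	extern void bpf_rcu_read_unlock(void) __ksym;

	/* Global subprog: summarized as non-sleepable, since it calls nothing sleepable. */
	__noinline int subprog_nonsleepable(struct __sk_buff *skb)
	{
		return skb->len;
	}

	SEC("tc")
	int prog(struct __sk_buff *skb)
	{
		int len;

		bpf_rcu_read_lock();
		len = subprog_nonsleepable(skb); /* now allowed in the RCU read section */
		bpf_rcu_read_unlock();
		return len;
	}

A variant where the subprog calls a sleepable kfunc would still be rejected inside the RCU read section.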
Fixes: 9bb00b2895cb ("bpf: Add kfunc bpf_rcu_read_lock/unlock()")
Signed-off-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/20250301151846.1552362-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h          | 1 +
 include/linux/bpf_verifier.h | 1 +
 2 files changed, 2 insertions(+)
(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 15164787ce7f..1ee715983619 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1531,6 +1531,7 @@ struct bpf_prog_aux {
 	bool jits_use_priv_stack;
 	bool priv_stack_requested;
 	bool changes_pkt_data;
+	bool might_sleep;
 	u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
 	struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
 	struct bpf_arena *arena;
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index bbd013c38ff9..d338f2a96bba 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -667,6 +667,7 @@ struct bpf_subprog_info {
 	/* true if bpf_fastcall stack region is used by functions that can't be inlined */
 	bool keep_fastcall_stack: 1;
 	bool changes_pkt_data: 1;
+	bool might_sleep: 1;
 
 	enum priv_stack_mode priv_stack_mode;
 	u8 arg_cnt;
--
cgit v1.2.3

From e723608bf428014b15d9904e062f44f5fe473ad6 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Mon, 3 Mar 2025 16:32:38 -0800
Subject: bpf: Add verifier support for timed may_goto

Implement support in the verifier for replacing the may_goto implementation from a counter-based approach to one which samples time on the local CPU to have a bigger loop bound.

We implement it by maintaining 16 bytes per stack frame: 8 bytes for the count that amortizes time sampling, and 8 bytes for the starting timestamp. To minimize overhead, we need to avoid spilling and filling of registers around this sequence, so we push this cost into the time sampling function 'arch_bpf_timed_may_goto'. This is a JIT-specific wrapper around bpf_check_timed_may_goto which returns us the count to store into the stack through BPF_REG_AX. All caller-saved registers (r0-r5) are guaranteed to remain untouched.

The loop can be broken by returning a count of 0; otherwise we dispatch into the function when the count drops to 0, and the runtime chooses to refresh it (by returning the count as BPF_MAX_TIMED_LOOPS) or to return 0 and abort the loop on the next iteration. Since the check for 0 is done right after loading the count from the stack, all subsequent cond_break sequences should immediately break as well, whether of the same loop or of subsequent loops in the program.

We pass in the stack_depth of the count (and thus the timestamp, by adding 8 to it) to the arch_bpf_timed_may_goto call so that it can be passed in to bpf_check_timed_may_goto as an argument after r1 is saved, by adding the offset to r10/fp. This adjustment will be arch specific, and the next patch will introduce support for x86.

Note that depending on loop complexity, time spent in the loop can be more than the current limit (250 ms), but imposing an upper bound on program runtime is an orthogonal problem which will be addressed when program cancellations are supported.

The current time afforded by cond_break may not be enough for cases where BPF programs want to implement locking algorithms inline, and use cond_break as a promise to the verifier that they will eventually terminate.
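The runtime side of the refresh can be pictured as follows (a sketch of the intended logic under the description above, not necessarily the final implementation):

	u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
	{
		u64 time = ktime_get_mono_fast_ns();

		/* First expiry in this stack frame: record the start timestamp. */
		if (!p->timestamp) {
			p->timestamp = time;
			return BPF_MAX_TIMED_LOOPS;
		}
		/* Time budget (250 ms here) exhausted: return 0 to break the loop. */
		if (time - p->timestamp >= (NSEC_PER_SEC / 4))
			return 0;
		/* Otherwise grant another BPF_MAX_TIMED_LOOPS iterations. */
		return BPF_MAX_TIMED_LOOPS;
	}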
Below are some benchmarking numbers on the time taken per-iteration for an empty loop that counts the number of iterations until cond_break fires. For comparison, we compare it against bpf_for/bpf_repeat, which is another way to achieve the same number of spins (BPF_MAX_LOOPS). The hardware used for benchmarking was a Sapphire Rapids Intel server with the performance governor enabled; mitigations were enabled.

+-----------------------------+--------------+--------------+------------------+
| Loop type                   | Iterations   | Time (ms)    | Time/iter (ns)   |
+-----------------------------+--------------+--------------+------------------+
| may_goto                    | 8388608      | 3            | 0.36             |
| timed_may_goto (count=65535)| 589674932    | 250          | 0.42             |
| bpf_for                     | 8388608      | 10           | 1.19             |
+-----------------------------+--------------+--------------+------------------+

This gives a good approximation at low overhead while staying close to the current implementation.

Signed-off-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/20250304003239.2390751-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h    | 1 +
 include/linux/filter.h | 8 ++++++++
 2 files changed, 9 insertions(+)
(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1ee715983619..6af8b2db8b15 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1987,6 +1987,7 @@ struct bpf_array {
  */
 enum {
 	BPF_MAX_LOOPS = 8 * 1024 * 1024,
+	BPF_MAX_TIMED_LOOPS = 0xffff,
 };
 
 #define BPF_F_ACCESS_MASK	(BPF_F_RDONLY | \
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 3ed6eb9e7c73..02dda5c53d91 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -669,6 +669,11 @@ struct bpf_prog_stats {
 	struct u64_stats_sync syncp;
 } __aligned(2 * sizeof(u64));
 
+struct bpf_timed_may_goto {
+	u64 count;
+	u64 timestamp;
+};
+
 struct sk_filter {
 	refcount_t refcnt;
 	struct rcu_head rcu;
@@ -1130,8 +1135,11 @@ bool bpf_jit_supports_ptr_xchg(void);
 bool bpf_jit_supports_arena(void);
 bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
 bool bpf_jit_supports_private_stack(void);
+bool bpf_jit_supports_timed_may_goto(void);
 u64 bpf_arch_uaddress_limit(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
+u64 arch_bpf_timed_may_goto(void);
+u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *);
 bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id);
 
 static inline bool bpf_dump_raw_ok(const struct cred *cred)
--
cgit v1.2.3

From 880442305a3908589bf4d6fc1d79edb577ee497c Mon Sep 17 00:00:00 2001
From: Peilin Ye
Date: Tue, 4 Mar 2025 01:06:13 +0000
Subject: bpf: Introduce load-acquire and store-release instructions

Introduce BPF instructions with load-acquire and store-release semantics, as discussed in [1]. Define 2 new flags:

  #define BPF_LOAD_ACQ    0x100
  #define BPF_STORE_REL   0x110

A "load-acquire" is a BPF_STX | BPF_ATOMIC instruction with the 'imm' field set to BPF_LOAD_ACQ (0x100). Similarly, a "store-release" is a BPF_STX | BPF_ATOMIC instruction with the 'imm' field set to BPF_STORE_REL (0x110).

Unlike existing atomic read-modify-write operations that only support BPF_W (32-bit) and BPF_DW (64-bit) size modifiers, load-acquires and store-releases also support BPF_B (8-bit) and BPF_H (16-bit). As an exception, however, 64-bit load-acquires/store-releases are not supported on 32-bit architectures (to fix a build error reported by the kernel test robot).

An 8- or 16-bit load-acquire zero-extends the value before writing it to a 32-bit register, just like ARM64 instruction LDARH and friends.

Similar to existing atomic read-modify-write operations, misaligned load-acquires/store-releases are not allowed (even if BPF_F_ANY_ALIGNMENT is set).

As an example, consider the following 64-bit load-acquire BPF instruction (assuming little-endian):

  db 10 00 00 00 01 00 00  r0 = load_acquire((u64 *)(r1 + 0x0))

  opcode (0xdb): BPF_ATOMIC | BPF_DW | BPF_STX
  imm (0x00000100): BPF_LOAD_ACQ

Similarly, a 16-bit BPF store-release:

  cb 21 00 00 10 01 00 00  store_release((u16 *)(r1 + 0x0), w2)

  opcode (0xcb): BPF_ATOMIC | BPF_H | BPF_STX
  imm (0x00000110): BPF_STORE_REL

In arch/{arm64,s390,x86}/net/bpf_jit_comp.c, have bpf_jit_supports_insn(..., /*in_arena=*/true) return false for the new instructions, until the corresponding JIT compiler supports them in arena.

[1] https://lore.kernel.org/all/20240729183246.4110549-1-yepeilin@google.com/

Acked-by: Eduard Zingerman
Acked-by: Ilya Leoshkevich
Cc: kernel test robot
Signed-off-by: Peilin Ye
Link: https://lore.kernel.org/r/a217f46f0e445fbd573a1a024be5c6bf1d5fe716.1741049567.git.yepeilin@google.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h    | 15 +++++++++++++++
 include/linux/filter.h |  2 ++
 2 files changed, 17 insertions(+)
(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6af8b2db8b15..0d7b70124d81 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -991,6 +991,21 @@ static inline bool bpf_pseudo_func(const struct bpf_insn *insn)
 	return bpf_is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC;
 }
 
+/* Given a BPF_ATOMIC instruction @atomic_insn, return true if it is an
+ * atomic load or store, and false if it is a read-modify-write instruction.
+ */
+static inline bool
+bpf_atomic_is_load_store(const struct bpf_insn *atomic_insn)
+{
+	switch (atomic_insn->imm) {
+	case BPF_LOAD_ACQ:
+	case BPF_STORE_REL:
+		return true;
+	default:
+		return false;
+	}
+}
+
 struct bpf_prog_ops {
 	int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr,
 			union bpf_attr __user *uattr);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 02dda5c53d91..590476743f7a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -364,6 +364,8 @@ static inline bool insn_is_cast_user(const struct bpf_insn *insn)
  * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg);
  * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg)
  * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg)
+ * BPF_LOAD_ACQ dst_reg = smp_load_acquire(src_reg + off16)
+ * BPF_STORE_REL smp_store_release(dst_reg + off16, src_reg)
  */
 
 #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF)			\
--
cgit v1.2.3

From 14c8552db64476ffc27c13dc6652fc0dac31c0ba Mon Sep 17 00:00:00 2001
From: Eduard Zingerman
Date: Tue, 4 Mar 2025 11:50:22 -0800
Subject: bpf: simple DFA-based live registers analysis

Compute may-live registers before each instruction in the program. A register is live before instruction I if it is read by I or by some instruction S following I during program execution, and is not overwritten between I and S.

This information will be used in the next patch as a hint in func_states_equal().
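As a small worked illustration of that definition (hand-computed, not verifier output):

  0: w0 = w1	; use {r1}, def {r0} -> live before: {r1, r2}
  1: w0 += w2	; use {r0, r2}, def {r0} -> live before: {r0, r2}
  2: exit	; use {r0} (return value) -> live before: {r0}

Note that r2 is live before instruction 0 even though instruction 0 does not read it, because instruction 1 reads it and nothing in between overwrites it.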
Use a simple algorithm described in [1] to compute this information:
- define the following:
  - I.use : a set of all registers read by instruction I;
  - I.def : a set of all registers written by instruction I;
  - I.in : a set of all registers that may be alive before I execution;
  - I.out : a set of all registers that may be alive after I execution;
  - I.successors : a set of instructions S that might immediately follow I for some program execution;
- associate separate empty sets 'I.in' and 'I.out' with each instruction;
- visit each instruction in a postorder and update the corresponding 'I.in' and 'I.out' sets as follows:

      I.out = U [S.in for S in I.successors]
      I.in  = (I.out / I.def) U I.use

  (where U stands for set union and / stands for set difference);
- repeat the computation while I.{in,out} changes for any instruction.

On the implementation side, keep things as simple as possible:
- check_cfg() already marks instructions EXPLORED in post-order, modify it to save the index of each EXPLORED instruction in a vector;
- represent I.{in,out,use,def} as bitmasks;
- don't split the program into basic blocks and don't maintain the work queue, instead:
  - do fixed-point computation by visiting each instruction;
  - maintain a simple 'changed' flag if I.{in,out} for any instruction changes.

Measurements show that even such a simplistic implementation does not add measurable verification-time overhead (for selftests, at least).

Note on the check_cfg() ex_insn_beg/ex_done change: to avoid out-of-bounds access to the env->cfg.insn_postorder array, it must be guaranteed that an instruction transitions to the EXPLORED state only once. Previously this was not the case for incorrect programs with direct calls to exception callbacks.

The 'align' selftest needs adjustment to skip the computed insn/live registers printout; otherwise it matches lines from the live registers printout.

[1] https://en.wikipedia.org/wiki/Live-variable_analysis

Signed-off-by: Eduard Zingerman
Link: https://lore.kernel.org/r/20250304195024.2478889-4-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_verifier.h | 6 ++++++
 1 file changed, 6 insertions(+)
(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d338f2a96bba..d6cfc4ee6820 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -591,6 +591,8 @@ struct bpf_insn_aux_data {
 	 * accepts callback function as a parameter.
 	 */
 	bool calls_callback;
+	/* registers alive before this instruction. */
+	u16 live_regs_before;
 };
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
@@ -748,7 +750,11 @@ struct bpf_verifier_env {
 	struct {
 		int *insn_state;
 		int *insn_stack;
+		/* vector of instruction indexes sorted in post-order */
+		int *insn_postorder;
 		int cur_stack;
+		/* current position in the insn_postorder vector */
+		int cur_postorder;
 	} cfg;
 	struct backtrack_state bt;
 	struct bpf_insn_hist_entry *insn_hist;
--
cgit v1.2.3

From 082f1db02c8034fee787ea9809775ea861c50430 Mon Sep 17 00:00:00 2001
From: Blaise Boscaccy
Date: Mon, 10 Mar 2025 15:17:11 -0700
Subject: security: Propagate caller information in bpf hooks

Certain bpf syscall subcommands are available for usage from both userspace and the kernel. LSM modules or eBPF gatekeeper programs may need to take a different course of action depending on whether a BPF syscall originated from the kernel or from userspace.

Additionally, some of the bpf_attr struct fields contain pointers to arbitrary memory. Currently the functionality to determine whether a pointer refers to kernel memory or userspace memory is exposed to the bpf verifier, but that information is missing from various LSM hooks.

Here we augment the LSM hooks to provide this data by simply passing a boolean flag, indicating whether the call originated in the kernel, to any hook that takes a bpf_attr struct corresponding to a subcommand that may be called from the kernel.

Signed-off-by: Blaise Boscaccy
Acked-by: Song Liu
Acked-by: Paul Moore
Link: https://lore.kernel.org/r/20250310221737.821889-2-bboscaccy@linux.microsoft.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/lsm_hook_defs.h |  6 +++---
 include/linux/security.h      | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index e2f1ce37c41e..f5aafd3ba5d4 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -426,14 +426,14 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule)
 #endif /* CONFIG_AUDIT */
 
 #ifdef CONFIG_BPF_SYSCALL
-LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size)
+LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size, bool kernel)
 LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode)
 LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog)
 LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr,
-	 struct bpf_token *token)
+	 struct bpf_token *token, bool kernel)
 LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map)
 LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
-	 struct bpf_token *token)
+	 struct bpf_token *token, bool kernel)
 LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
 LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr,
 	 const struct path *path)
diff --git a/include/linux/security.h b/include/linux/security.h
index 980b6c207cad..b2010034f82b 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2249,14 +2249,14 @@ struct bpf_map;
 struct bpf_prog;
 struct bpf_token;
 #ifdef CONFIG_SECURITY
-extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
+extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size, bool kernel);
 extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
 extern int security_bpf_prog(struct bpf_prog *prog);
 extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
-				   struct bpf_token *token);
+				   struct bpf_token *token, bool kernel);
 extern void security_bpf_map_free(struct bpf_map *map);
 extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
-				  struct bpf_token *token);
+				  struct bpf_token *token, bool kernel);
 extern void security_bpf_prog_free(struct bpf_prog *prog);
 extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
 				     const struct path *path);
@@ -2265,7 +2265,7 @@ extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cm
 extern int security_bpf_token_capable(const struct bpf_token *token, int cap);
 #else
 static inline int security_bpf(int cmd, union bpf_attr *attr,
-			       unsigned int size)
+			       unsigned int size, bool kernel)
 {
 	return 0;
 }
@@ -2281,7 +2281,7 @@ static inline int security_bpf_prog(struct bpf_prog *prog)
 }
 
 static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
-					  struct bpf_token *token)
+					  struct bpf_token *token, bool kernel)
{ return 0; } @@ -2290,7 +2290,7 @@ static inline void security_bpf_map_free(struct bpf_map *map) { } static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) { return 0; } -- cgit v1.2.3 From 654b33ada4ab5e926cd9c570196fefa7bec7c1df Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Sat, 1 Mar 2025 15:06:24 +0300 Subject: proc: fix UAF in proc_get_inode() Fix the race between rmmod and /proc/XXX's inode instantiation. The bug is that pde->proc_ops doesn't belong to /proc; it belongs to a module, therefore dereferencing it after the /proc entry has been registered is a bug unless the use_pde()/unuse_pde() pair has been used. use_pde()/unuse_pde() can be avoided (2 atomic ops!) because pde->proc_ops never changes, so the information necessary for inode instantiation can be saved _before_ proc_register() in the PDE itself and used later, avoiding the pde->proc_ops->... dereference. rmmod lookup sys_delete_module proc_lookup_de pde_get(de); proc_get_inode(dir->i_sb, de); mod->exit() proc_remove remove_proc_subtree proc_entry_rundown(de); free_module(mod); if (S_ISREG(inode->i_mode)) if (de->proc_ops->proc_read_iter) --> As the module is already freed, this will trigger a UAF BUG: unable to handle page fault for address: fffffbfff80a702b PGD 817fc4067 P4D 817fc4067 PUD 817fc0067 PMD 102ef4067 PTE 0 Oops: Oops: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 26 UID: 0 PID: 2667 Comm: ls Tainted: G Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) RIP: 0010:proc_get_inode+0x302/0x6e0 RSP: 0018:ffff88811c837998 EFLAGS: 00010a06 RAX: dffffc0000000000 RBX: ffffffffc0538140 RCX: 0000000000000007 RDX: 1ffffffff80a702b RSI: 0000000000000001 RDI: ffffffffc0538158 RBP: ffff8881299a6000 R08: 0000000067bbe1e5 R09: 1ffff11023906f20 R10: ffffffffb560ca07 R11: ffffffffb2b43a58 R12: ffff888105bb78f0 R13: ffff888100518048 R14: ffff8881299a6004 R15: 0000000000000001 FS: 00007f95b9686840(0000) GS:ffff8883af100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: fffffbfff80a702b CR3: 0000000117dd2000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: proc_lookup_de+0x11f/0x2e0 __lookup_slow+0x188/0x350 walk_component+0x2ab/0x4f0 path_lookupat+0x120/0x660 filename_lookup+0x1ce/0x560 vfs_statx+0xac/0x150 __do_sys_newstat+0x96/0x110 do_syscall_64+0x5f/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e [adobriyan@gmail.com: don't do 2 atomic ops on the common path] Link: https://lkml.kernel.org/r/3d25ded0-1739-447e-812b-e34da7990dcf@p183 Fixes: 778f3dd5a13c ("Fix procfs compat_ioctl regression") Signed-off-by: Ye Bin Signed-off-by: Alexey Dobriyan Cc: Al Viro Cc: David S. Miller Cc: Signed-off-by: Andrew Morton --- include/linux/proc_fs.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 0b2a89854440..ea62201c74c4 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -20,10 +20,13 @@ enum { * If in doubt, ignore this flag.
*/ #ifdef MODULE - PROC_ENTRY_PERMANENT = 0U, + PROC_ENTRY_PERMANENT = 0U, #else - PROC_ENTRY_PERMANENT = 1U << 0, + PROC_ENTRY_PERMANENT = 1U << 0, #endif + + PROC_ENTRY_proc_read_iter = 1U << 1, + PROC_ENTRY_proc_compat_ioctl = 1U << 2, }; struct proc_ops { -- cgit v1.2.3 From 39a326e6daba5703a8e4de17cacc7ddb0dc5b07f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 28 Feb 2025 09:53:36 -0800 Subject: mm/damon: respect core layer filters' allowance decision on ops layer Filtering decisions are made in filters evaluation order. Once a decision is made by a filter, filters that are scheduled to be evaluated after the decision-making filter should just respect it. This is the intended and documented behavior. Since core layer-handled filters are evaluated before operations layer-handled filters, decisions made on the core layer should be respected by the ops layer. In case of reject filters, the decision is respected, since core layer-rejected regions are not passed to the ops layer. But in case of allow filters, ops layer filters don't know whether the region was passed to them because it was allowed by core filters or just because it didn't match any core layer filter. The current, wrong implementation assumes it was because the region did not match any core filters. As a result, the decision is not respected. Pass the missing information to the ops layer using a new field in 'struct damos', and make the ops layer filters respect it. Link: https://lkml.kernel.org/r/20250228175336.42781-1-sj@kernel.org Fixes: 491fee286e56 ("mm/damon/core: support damos_filter->allow") Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index af525252b853..c9074d569596 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -470,6 +470,11 @@ struct damos { unsigned long next_apply_sis; /* informs if ongoing DAMOS walk for this scheme is finished */ bool walk_completed; + /* + * If the current region in the filtering stage is allowed by core + * layer-handled filters. If true, operations layer allows it, too. + */ + bool core_filters_allowed; /* public: */ struct damos_quota quota; struct damos_watermarks wmarks; -- cgit v1.2.3 From 73f839b6d2ed75f281bd75aeb68e81bce373bdee Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 6 Mar 2025 10:31:33 +0800 Subject: mm: memcontrol: fix swap counter leak from offline cgroup Commit 6769183166b3 removed the id parameter from swap_cgroup_record() and obtained the memcg id from mem_cgroup_id(folio_memcg(folio)). However, the caller may update a different memcg's counter, not folio_memcg(folio)'s. E.g. in the caller of mem_cgroup_swapout(), @swap_memcg could be different from @memcg and update the counter of @swap_memcg, but swap_cgroup_record() records the wrong memcg's ID. When it is uncharged from __mem_cgroup_uncharge_swap(), the swap counter will leak because of the wrongly recorded ID. Fix it by bringing the id parameter back.
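A minimal sketch of the intended call pattern after the fix (illustrative, following the mem_cgroup_swapout() example above rather than the literal patch hunk):

	/* @swap_memcg, not folio_memcg(folio), is the memcg whose swap
	 * counter was charged, so record that memcg's ID with the entry.
	 */
	swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry);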
Link: https://lkml.kernel.org/r/20250306023133.44838-1-songmuchun@bytedance.com Fixes: 6769183166b3 ("mm/swap_cgroup: decouple swap cgroup recording and clearing") Signed-off-by: Muchun Song Reviewed-by: Kairui Song Cc: Chris Li Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/swap_cgroup.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index b5ec038069da..91cdf12190a0 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -6,7 +6,7 @@ #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -extern void swap_cgroup_record(struct folio *folio, swp_entry_t ent); +extern void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_t ent); extern unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents); extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); @@ -15,7 +15,7 @@ extern void swap_cgroup_swapoff(int type); #else static inline -void swap_cgroup_record(struct folio *folio, swp_entry_t ent) +void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_t ent) { } -- cgit v1.2.3 From b9c0e49abfca06f1a109acea834bcfc934f33f76 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 10 Mar 2025 14:35:24 +0000 Subject: mm: decline to manipulate the refcount on a slab page Slab pages now have a refcount of 0, so nobody should be trying to manipulate the refcount on them. Doing so has little effect; the object could be freed and reallocated to a different purpose, although the slab itself would not be freed until the refcount was put, making it behave rather like TYPESAFE_BY_RCU. Unfortunately, __iov_iter_get_pages_alloc() does take a refcount. Fix that to not change the refcount, and make put_page() silently not change the refcount. get_page() warns so that we can fix any other callers that need to be changed. Long-term, networking needs to stop taking a refcount on the pages that it uses and rely on the caller to hold whatever references are necessary to make the memory stable. In the medium term, more page types are going to have a zero refcount, so we'll want to move get_page() and put_page() out of line.
Link: https://lkml.kernel.org/r/20250310143544.1216127-1-willy@infradead.org Fixes: 9aec2fb0fd5e (slab: allocate frozen pages) Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Hannes Reinecke Closes: https://lore.kernel.org/all/08c29e4b-2f71-4b6d-8046-27e407214d8c@suse.com/ Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b1068ddcbb7..fd1e85b4b48a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1458,7 +1458,10 @@ static inline void folio_get(struct folio *folio) static inline void get_page(struct page *page) { - folio_get(page_folio(page)); + struct folio *folio = page_folio(page); + if (WARN_ON_ONCE(folio_test_slab(folio))) + return; + folio_get(folio); } static inline __must_check bool try_get_page(struct page *page) @@ -1552,6 +1555,9 @@ static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); + if (folio_test_slab(folio)) + return; + /* * For some devmap managed pages we need to catch refcount transition * from 2 to 1: -- cgit v1.2.3 From ac053946f5c40ce90ca7ccb75ed687612d9eccf9 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 27 Jan 2025 17:05:06 +0100 Subject: compiler.h: introduce TYPEOF_UNQUAL() macro Define TYPEOF_UNQUAL() to use __typeof_unqual__() as the typeof operator when available, to return the unqualified type of the expression. The current version of sparse doesn't know anything about the __typeof_unqual__() operator. Avoid the usage of __typeof_unqual__() when sparse checking is active to prevent sparse errors about the unknown keyword. Link: https://lkml.kernel.org/r/20250127160709.80604-3-ubizjak@gmail.com Signed-off-by: Uros Bizjak Cc: Thomas Gleixner Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Brian Gerst Cc: Denys Vlasenko Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Arnd Bergmann Cc: Boqun Feng Cc: Borislav Petkov Cc: Dave Hansen Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Kent Overstreet Cc: Nadav Amit Cc: Paolo Abeni Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 8 ++++++++ include/linux/compiler-gcc.h | 8 ++++++++ include/linux/compiler.h | 20 ++++++++++++++++++++ 3 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 2e7c2c282f3a..4fc8e26914ad 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -128,3 +128,11 @@ */ #define ASM_INPUT_G "ir" #define ASM_INPUT_RM "r" + +/* + * Declare compiler support for __typeof_unqual__() operator. + * + * Bindgen uses LLVM even if our C compiler is GCC, so we cannot + * rely on the auto-detected CONFIG_CC_HAS_TYPEOF_UNQUAL. + */ +#define CC_HAS_TYPEOF_UNQUAL (__clang_major__ >= 19) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index c9b58188ec61..32048052c64a 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -137,3 +137,11 @@ #if GCC_VERSION < 90100 #undef __alloc_size__ #endif + +/* + * Declare compiler support for __typeof_unqual__() operator. + * + * Bindgen uses LLVM even if our C compiler is GCC, so we cannot + * rely on the auto-detected CONFIG_CC_HAS_TYPEOF_UNQUAL.
+ */ +#define CC_HAS_TYPEOF_UNQUAL (__GNUC__ >= 14) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 155385754824..3a7a537e13a3 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -210,6 +210,26 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define __must_be_cstr(p) \ __BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)") +/* + * Use __typeof_unqual__() when available. + * + * XXX: Remove test for __CHECKER__ once + * sparse learns about __typeof_unqual__(). + */ +#if CC_HAS_TYPEOF_UNQUAL && !defined(__CHECKER__) +# define USE_TYPEOF_UNQUAL 1 +#endif + +/* + * Define TYPEOF_UNQUAL() to use __typeof_unqual__() as typeof + * operator when available, to return an unqualified type of the exp. + */ +#if defined(USE_TYPEOF_UNQUAL) +# define TYPEOF_UNQUAL(exp) __typeof_unqual__(exp) +#else +# define TYPEOF_UNQUAL(exp) __typeof__(exp) +#endif + #endif /* __KERNEL__ */ /** -- cgit v1.2.3 From 8a3c392388c6a6e0c8937a24712b630ec9ac7016 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 27 Jan 2025 17:05:07 +0100 Subject: percpu: use TYPEOF_UNQUAL() in variable declarations Use TYPEOF_UNQUAL() to declare variables as a corresponding type without named address space qualifier to avoid "`__seg_gs' specified for auto variable `var'" errors. Link: https://lkml.kernel.org/r/20250127160709.80604-4-ubizjak@gmail.com Signed-off-by: Uros Bizjak Acked-by: Nadav Amit Acked-by: Christoph Lameter Cc: Dennis Zhou Cc: Tejun Heo Cc: Andy Lutomirski Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Kent Overstreet Cc: Arnd Bergmann Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Peter Zijlstra Cc: Will Deacon Cc: Waiman Long Cc: Boqun Feng Cc: Linus Torvalds Cc: Brian Gerst Cc: Denys Vlasenko Signed-off-by: Andrew Morton --- include/linux/part_stat.h | 2 +- include/linux/percpu-defs.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index ac8c44dd8237..c5e9cac0575e 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -33,7 +33,7 @@ struct disk_stats { #define part_stat_read(part, field) \ ({ \ - typeof((part)->bd_stats->field) res = 0; \ + TYPEOF_UNQUAL((part)->bd_stats->field) res = 0; \ unsigned int _cpu; \ for_each_possible_cpu(_cpu) \ res += per_cpu_ptr((part)->bd_stats, _cpu)->field; \ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 5b520fe86b60..79b9402404f1 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -317,7 +317,7 @@ static __always_inline void __this_cpu_preempt_check(const char *op) { } #define __pcpu_size_call_return(stem, variable) \ ({ \ - typeof(variable) pscr_ret__; \ + TYPEOF_UNQUAL(variable) pscr_ret__; \ __verify_pcpu_ptr(&(variable)); \ switch(sizeof(variable)) { \ case 1: pscr_ret__ = stem##1(variable); break; \ @@ -332,7 +332,7 @@ static __always_inline void __this_cpu_preempt_check(const char *op) { } #define __pcpu_size_call_return2(stem, variable, ...) 
\ ({ \ - typeof(variable) pscr2_ret__; \ + TYPEOF_UNQUAL(variable) pscr2_ret__; \ __verify_pcpu_ptr(&(variable)); \ switch(sizeof(variable)) { \ case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \ -- cgit v1.2.3 From 6a39fe05ecaa3946ed0af9efd3e56689d519e420 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 27 Jan 2025 17:05:08 +0100 Subject: percpu: use TYPEOF_UNQUAL() in *_cpu_ptr() accessors Use the TYPEOF_UNQUAL() macro to declare the return type of *_cpu_ptr() accessors in the generic named address space to avoid "access to data from pointer to non-enclosed address space" type errors. Link: https://lkml.kernel.org/r/20250127160709.80604-5-ubizjak@gmail.com Signed-off-by: Uros Bizjak Acked-by: Nadav Amit Acked-by: Christoph Lameter Cc: Dennis Zhou Cc: Tejun Heo Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Brian Gerst Cc: Peter Zijlstra Cc: Arnd Bergmann Cc: Boqun Feng Cc: "David S. Miller" Cc: Denys Vlasenko Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Kent Overstreet Cc: Paolo Abeni Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/percpu-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 79b9402404f1..a7cf954ea99d 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -221,7 +221,7 @@ do { \ } while (0) #define PERCPU_PTR(__p) \ - (typeof(*(__p)) __force __kernel *)((__force unsigned long)(__p)) + (TYPEOF_UNQUAL(*(__p)) __force __kernel *)((__force unsigned long)(__p)) #ifdef CONFIG_SMP -- cgit v1.2.3 From 6cea5ae714ba47ea4807d15903baca9857a450e6 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 27 Jan 2025 17:05:09 +0100 Subject: percpu: repurpose __percpu tag as a named address space qualifier The patch introduces the __percpu_qual define and repurposes the __percpu tag as a named address space qualifier using the new define. Arches can now conditionally define __percpu_qual as their named address space qualifier for percpu variables. Link: https://lkml.kernel.org/r/20250127160709.80604-6-ubizjak@gmail.com Signed-off-by: Uros Bizjak Acked-by: Nadav Amit Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Brian Gerst Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Boqun Feng Cc: Borislav Petkov Cc: Dave Hansen Cc: "David S.
Miller" Cc: Denys Vlasenko Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Kent Overstreet Cc: Paolo Abeni Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/compiler_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 981cc3d7e3aa..5d6544545658 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -57,7 +57,7 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __user BTF_TYPE_TAG(user) # endif # define __iomem -# define __percpu BTF_TYPE_TAG(percpu) +# define __percpu __percpu_qual BTF_TYPE_TAG(percpu) # define __rcu BTF_TYPE_TAG(rcu) # define __chk_user_ptr(x) (void)0 -- cgit v1.2.3 From 89ce924f0bd447eb52a5f224d879dbf8f09451db Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 24 Jan 2025 00:41:32 -0500 Subject: mm: memcontrol: move memsw charge callbacks to v1 The interweaving of two entirely different swap accounting strategies has been one of the more confusing parts of the memcg code. Split out the v1 code to clarify the implementation and a handful of callsites, and to avoid building the v1 bits when !CONFIG_MEMCG_V1. text data bss dec hex filename 39253 6446 4160 49859 c2c3 mm/memcontrol.o.old 38877 6382 4160 49419 c10b mm/memcontrol.o Link: https://lkml.kernel.org/r/20250124054132.45643-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Balbir Singh Acked-by: Shakeel Butt Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 17 +++++++++++------ include/linux/swap.h | 5 ----- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6e74b8254d9b..57664e2a8fb7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -649,8 +649,6 @@ int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); -void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); - void __mem_cgroup_uncharge(struct folio *folio); /** @@ -1165,10 +1163,6 @@ static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, return 0; } -static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr) -{ -} - static inline void mem_cgroup_uncharge(struct folio *folio) { } @@ -1848,6 +1842,9 @@ static inline void mem_cgroup_exit_user_fault(void) current->in_user_fault = 0; } +void memcg1_swapout(struct folio *folio, swp_entry_t entry); +void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages); + #else /* CONFIG_MEMCG_V1 */ static inline unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, @@ -1875,6 +1872,14 @@ static inline void mem_cgroup_exit_user_fault(void) { } +static inline void memcg1_swapout(struct folio *folio, swp_entry_t entry) +{ +} + +static inline void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) +{ +} + #endif /* CONFIG_MEMCG_V1 */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index b13b72645db3..91b30701274e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -659,7 +659,6 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) #endif #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -void mem_cgroup_swapout(struct folio *folio, 
swp_entry_t entry); int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) @@ -680,10 +679,6 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct folio *folio); #else -static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) -{ -} - static inline int mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) { -- cgit v1.2.3 From 599b684a7854e51a51358fe59bbdfea281f0b461 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 10 Feb 2025 20:37:45 +0100 Subject: mm/rmap: convert make_device_exclusive_range() to make_device_exclusive() The single "real" user in the tree of make_device_exclusive_range() always requests making only a single address exclusive. The current implementation is hard to fix for properly supporting anonymous THP / large folios and for avoiding messing with rmap walks in weird ways. So let's always process a single address/page and return folio + page to minimize page -> folio lookups. This is a preparation for further changes. Reject any non-anonymous or hugetlb folios early, directly after GUP. While at it, extend the documentation of make_device_exclusive() to clarify some things. Link: https://lkml.kernel.org/r/20250210193801.781278-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Simona Vetter Reviewed-by: Alistair Popple Tested-by: Alistair Popple Cc: Alex Shi Cc: Danilo Krummrich Cc: Dave Airlie Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Jonathan Corbet Cc: Karol Herbst Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Lyude Cc: "Masami Hiramatsu (Google)" Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: SeongJae Park Cc: Vlastimil Babka Cc: Yanteng Si Cc: Barry Song Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 2 +- include/linux/rmap.h | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index e2dd57ca368b..d4e714661826 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -46,7 +46,7 @@ struct mmu_interval_notifier; * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no * longer have exclusive access to the page. When sent during creation of an * exclusive range the owner will be initialised to the value provided by the - * caller of make_device_exclusive_range(), otherwise the owner will be NULL. + * caller of make_device_exclusive(), otherwise the owner will be NULL. 
*/ enum mmu_notifier_event { MMU_NOTIFY_UNMAP = 0, diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 683a04088f3f..86425d42c1a9 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -663,9 +663,8 @@ int folio_referenced(struct folio *, int is_locked, void try_to_migrate(struct folio *folio, enum ttu_flags flags); void try_to_unmap(struct folio *, enum ttu_flags flags); -int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, - unsigned long end, struct page **pages, - void *arg); +struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, + void *owner, struct folio **foliop); /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) -- cgit v1.2.3 From c25465eb7630ffcadaab29c1010071512f8c9621 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 10 Feb 2025 20:37:48 +0100 Subject: mm: use single SWP_DEVICE_EXCLUSIVE entry type There is no need for the distinction anymore; let's merge the readable and writable device-exclusive entries into a single device-exclusive entry type. Link: https://lkml.kernel.org/r/20250210193801.781278-7-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Simona Vetter Reviewed-by: Alistair Popple Tested-by: Alistair Popple Cc: Alex Shi Cc: Danilo Krummrich Cc: Dave Airlie Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Jonathan Corbet Cc: Karol Herbst Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Lyude Cc: "Masami Hiramatsu (Google)" Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: SeongJae Park Cc: Vlastimil Babka Cc: Yanteng Si Cc: Barry Song Signed-off-by: Andrew Morton --- include/linux/swap.h | 7 +++---- include/linux/swapops.h | 27 ++++----------------------- 2 files changed, 7 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 91b30701274e..9a48e79a0a52 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -74,14 +74,13 @@ static inline int current_is_kswapd(void) * to a special SWP_DEVICE_{READ|WRITE} entry. * * When a page is mapped by the device for exclusive access we set the CPU page - * table entries to special SWP_DEVICE_EXCLUSIVE_* entries. + * table entries to a special SWP_DEVICE_EXCLUSIVE entry. 
*/ #ifdef CONFIG_DEVICE_PRIVATE -#define SWP_DEVICE_NUM 4 +#define SWP_DEVICE_NUM 3 #define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM) #define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1) -#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) -#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3) +#define SWP_DEVICE_EXCLUSIVE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2) #else #define SWP_DEVICE_NUM 0 #endif diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 96f26e29fefe..64ea151a7ae3 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -186,26 +186,16 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); } -static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset) +static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { - return swp_entry(SWP_DEVICE_EXCLUSIVE_READ, offset); -} - -static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset) -{ - return swp_entry(SWP_DEVICE_EXCLUSIVE_WRITE, offset); + return swp_entry(SWP_DEVICE_EXCLUSIVE, offset); } static inline bool is_device_exclusive_entry(swp_entry_t entry) { - return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ || - swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE; + return swp_type(entry) == SWP_DEVICE_EXCLUSIVE; } -static inline bool is_writable_device_exclusive_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE); -} #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { @@ -227,12 +217,7 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry) return false; } -static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset) -{ - return swp_entry(0, 0); -} - -static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset) +static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(0, 0); } @@ -242,10 +227,6 @@ static inline bool is_device_exclusive_entry(swp_entry_t entry) { return false; } -static inline bool is_writable_device_exclusive_entry(swp_entry_t entry) -{ - return false; -} #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION -- cgit v1.2.3 From 6df8bae8e851eacf2acf2237860213e002aba74f Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 29 Jan 2025 18:06:32 +0000 Subject: mm: zbud: remove zbud The zbud compressed pages allocator is rarely used; most users use zsmalloc. zbud consumes much more memory (it stores only 1 or 2 compressed pages per physical page). The only advantage of zbud is a marginal performance improvement that by no means justifies the memory overhead. Historically, zsmalloc had significantly worse latency than zbud and z3fold but offered better memory savings. This is no longer the case as shown by a simple recent analysis [1]. In a kernel build test on tmpfs in a limited cgroup, zbud takes 2-3% less time than zsmalloc, but at the cost of using ~32% more memory (1.5G vs 1.13G). The tradeoff does not make sense for zbud in any practical scenario. The only alleged advantage of zbud is not having the dependency on CONFIG_MMU, but CONFIG_SWAP already depends on CONFIG_MMU anyway, and zbud is only used by zswap. Remove zbud after z3fold's removal, leaving zsmalloc as the one and only zpool allocator.
Leave the removal of the zpool API (and its associated config options) to a followup cleanup after no more allocators show up. Deprecating zbud for a few cycles before removing it was initially proposed [2], like z3fold was marked as deprecated for 2 cycles [3]. However, Johannes rightfully pointed out that the 2 cycles is too short for most downstream consumers, and z3fold was deprecated first only as a courtesy anyway. [1]https://lore.kernel.org/lkml/CAJD7tkbRF6od-2x_L8-A1QL3=2Ww13sCj4S3i4bNndqF+3+_Vg@mail.gmail.com/ [2]https://lore.kernel.org/lkml/Z5gdnSX5Lv-nfjQL@google.com/ [3]https://lore.kernel.org/lkml/20240904233343.933462-1-yosryahmed@google.com/ Link: https://lkml.kernel.org/r/20250129180633.3501650-3-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Alexander Gordeev Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Dan Streetman Cc: Heiko Carstens Cc: Huacai Chen Cc: Miaohe Lin Cc: Seth Jennings Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vitaly Wool Cc: Vlastimil Babka Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/zpool.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index a67d62b79698..5e6dc46b8cc4 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -4,9 +4,8 @@ * * Copyright (C) 2014 Dan Streetman * - * This is a common frontend for the zbud and zsmalloc memory - * storage pool implementations. Typically, this is used to - * store compressed memory. + * This is a common frontend for the zswap compressed memory storage + * implementations. */ #ifndef _ZPOOL_H_ -- cgit v1.2.3 From c372473a545edff2fdbc002fc67c181e17c7557b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 31 Jan 2025 12:31:53 +0000 Subject: mm: completely abstract unnecessary adj_start calculation The adj_start calculation has been a constant source of confusion in the VMA merge code. There are two cases to consider, one where we adjust the start of the vmg->middle VMA (i.e. the vmg->__adjust_middle_start merge flag is set), in which case adj_start is calculated as: (1) adj_start = vmg->end - vmg->middle->vm_start And the case where we adjust the start of the vmg->next VMA (i.e. the vmg->__adjust_next_start merge flag is set), in which case adj_start is calculated as: (2) adj_start = -(vmg->middle->vm_end - vmg->end) We apply (1) thusly: vmg->middle->vm_start = vmg->middle->vm_start + vmg->end - vmg->middle->vm_start Which simplifies to: vmg->middle->vm_start = vmg->end Similarly, we apply (2) as: vmg->next->vm_start = vmg->next->vm_start + -(vmg->middle->vm_end - vmg->end) Noting that for these VMAs to be mergeable vmg->middle->vm_end == vmg->next->vm_start and so this simplifies to: vmg->next->vm_start = vmg->next->vm_start + -(vmg->next->vm_start - vmg->end) Which simplifies to: vmg->next->vm_start = vmg->end Therefore in each case, we simply need to adjust the start of the VMA to vmg->end (!) and can do away with this adj_start calculation. The only caveat is that we must ensure we update the vm_pgoff field correctly. We therefore abstract this entire calculation to a new function vmg_adjust_set_range() which performs this calculation and sets the adjusted VMA's new range using the general vma_set_range() function. We also must update vma_adjust_trans_huge() which expects the now-abstracted adj_start parameter. It turns out this is wholly unnecessary. 
In vma_adjust_trans_huge() the relevant code is: if (adjust_next > 0) { struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); unsigned long nstart = next->vm_start; nstart += adjust_next; split_huge_pmd_if_needed(next, nstart); } The only case where this is relevant is when vmg->__adjust_middle_start is specified (in which case adj_next would have been positive), i.e. the one in which the vma specified is vmg->prev and this the sought 'next' VMA would be vmg->middle. We can therefore eliminate the find_vma() invocation altogether and simply provide the vmg->middle VMA in this instance, or NULL otherwise. Again we have an adj_next offset calculation: next->vm_start + vmg->end - vmg->middle->vm_start Where next == vmg->middle this simplifies to vmg->end as previously demonstrated. Therefore nstart is equal to vmg->end, which is already passed to vma_adjust_trans_huge() via the 'end' parameter and so this code (rather delightfully) simplifies to: if (next) split_huge_pmd_if_needed(next, end); With these changes in place, it becomes silly for commit_merge() to return vmg->target, as it is always the same and threaded through vmg, so we finally change commit_merge() to return an error value once again. This patch has no change in functional behaviour. Link: https://lkml.kernel.org/r/7bce2cd4b5afb56211822835d145471280c3dccc.1738326519.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Jann Horn Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 93e509b6c00e..e1bea54820ff 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -404,7 +404,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, - unsigned long end, long adjust_next); + unsigned long end, struct vm_area_struct *next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); @@ -571,7 +571,7 @@ static inline int madvise_collapse(struct vm_area_struct *vma, static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, - long adjust_next) + struct vm_area_struct *next) { } static inline int is_swap_pmd(pmd_t pmd) -- cgit v1.2.3 From 51ff4d7486f0c0b4110a6da4af805b179dd7b11e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 1 Feb 2025 15:18:00 -0800 Subject: mm: avoid extra mem_alloc_profiling_enabled() checks Refactor code to avoid extra mem_alloc_profiling_enabled() checks inside pgalloc_tag_get() function which is often called after that check was already done. 
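To make the intended division of work concrete, a minimal sketch of a caller after the refactor (hypothetical function name; the real callers live in the page allocator):

	static void profile_page_free(struct page *page, unsigned int nr)
	{
		struct alloc_tag *tag;

		if (!mem_alloc_profiling_enabled())
			return;			/* single static-key check */
		tag = __pgalloc_tag_get(page);	/* assumes the check was done */
		if (tag)
			this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
	}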
Link: https://lkml.kernel.org/r/20250201231803.2661189-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Cc: David Wang <00107082@163.com> Cc: Steven Rostedt Cc: Kent Overstreet Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Vlastimil Babka Cc: Yu Zhao Cc: Zhenhua Huang Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 3469c4b20105..4a82b6b4820e 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -205,28 +205,32 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) } } -static inline struct alloc_tag *pgalloc_tag_get(struct page *page) +/* Should be called only if mem_alloc_profiling_enabled() */ +static inline struct alloc_tag *__pgalloc_tag_get(struct page *page) { struct alloc_tag *tag = NULL; - - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - alloc_tag_sub_check(&ref); - if (ref.ct) - tag = ct_to_alloc_tag(ref.ct); - put_page_tag_ref(handle); - } + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_sub_check(&ref); + if (ref.ct) + tag = ct_to_alloc_tag(ref.ct); + put_page_tag_ref(handle); } return tag; } -static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) +static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) { - if (mem_alloc_profiling_enabled() && tag) + struct alloc_tag *tag; + + if (!mem_alloc_profiling_enabled()) + return; + + tag = __pgalloc_tag_get(page); + if (tag) this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); } @@ -241,8 +245,7 @@ static inline void clear_page_tag_ref(struct page *page) {} static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } -static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} +static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {} static inline void alloc_tag_sec_init(void) {} static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {} -- cgit v1.2.3 From 93d5440ece3c0aa341fb02e3a44a1b7ab44304c8 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 1 Feb 2025 15:18:02 -0800 Subject: alloc_tag: uninline code gated by mem_alloc_profiling_key in page allocator When a sizable code section is protected by a disabled static key, that code gets into the instruction cache even though it's not executed and consumes the cache, increasing cache misses. This can be remedied by moving such code into a separate uninlined function. On a Pixel6 phone, page allocation profiling overhead measured with CONFIG_MEM_ALLOC_PROFILING=y and profiling disabled is: baseline modified Big core 4.93% 1.53% Medium core 4.39% 1.41% Little core 1.02% 0.36% This improvement comes at the expense of the configuration when profiling gets enabled, since there is now an additional function call. 
The overhead from this additional call on Pixel6 is: Big core 0.24% Middle core 0.63% Little core 1.1% However this is negligible when compared with the overall overhead of the memory allocation profiling when it is enabled. On x86 this patch does not make noticeable difference because the overhead with mem_alloc_profiling_key disabled is much lower (under 1%) to start with, so any improvement is less visible and hard to distinguish from the noise. The overhead from additional call when profiling is enabled is also within noise levels. Link: https://lkml.kernel.org/r/20250201231803.2661189-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Steven Rostedt Cc: Vlastimil Babka Cc: Yu Zhao Cc: Zhenhua Huang Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 60 ++++----------------------------------------- 1 file changed, 5 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 4a82b6b4820e..c74077977830 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -162,47 +162,13 @@ static inline void update_page_tag_ref(union pgtag_ref_handle handle, union code } } -static inline void clear_page_tag_ref(struct page *page) -{ - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - set_codetag_empty(&ref); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } -} - -static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, - unsigned int nr) -{ - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } -} +/* Should be called only if mem_alloc_profiling_enabled() */ +void __clear_page_tag_ref(struct page *page); -static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) +static inline void clear_page_tag_ref(struct page *page) { - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - alloc_tag_sub(&ref, PAGE_SIZE * nr); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } + if (mem_alloc_profiling_enabled()) + __clear_page_tag_ref(page); } /* Should be called only if mem_alloc_profiling_enabled() */ @@ -222,18 +188,6 @@ static inline struct alloc_tag *__pgalloc_tag_get(struct page *page) return tag; } -static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) -{ - struct alloc_tag *tag; - - if (!mem_alloc_profiling_enabled()) - return; - - tag = __pgalloc_tag_get(page); - if (tag) - this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); -} - void pgalloc_tag_split(struct folio *folio, int old_order, int new_order); void pgalloc_tag_swap(struct folio *new, struct folio *old); @@ -242,10 +196,6 @@ void __init alloc_tag_sec_init(void); #else /* CONFIG_MEM_ALLOC_PROFILING */ static inline void clear_page_tag_ref(struct page *page) {} -static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, - unsigned int nr) {} -static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline void 
pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {} static inline void alloc_tag_sec_init(void) {} static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {} -- cgit v1.2.3 From 0eb7d2c337f9df3b6adbcbdce213595d94781e05 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:12 +0800 Subject: mm/swap: remove SWAP_FLAG_PRIO_SHIFT It doesn't make sense to have a shift of zero. Remove it to avoid confusion. Link: https://lkml.kernel.org/r/20250205092721.9395-4-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 9a48e79a0a52..bbd06cbd1f2b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -24,7 +24,6 @@ struct pagevec; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff -#define SWAP_FLAG_PRIO_SHIFT 0 #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ #define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ #define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ -- cgit v1.2.3 From c523aa890760b004eea47a0e00b5750a528f3ced Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:19 +0800 Subject: mm/swap: rename swap_swapcount() to swap_entry_swapped() The new function name reflects the real behaviour of the function more clearly and accurately, and the renaming avoids confusion between swap_swapcount() and swp_swapcount(). Link: https://lkml.kernel.org/r/20250205092721.9395-11-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index bbd06cbd1f2b..2fe91c293636 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -499,7 +499,7 @@ int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); -extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry); +extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; @@ -582,9 +582,9 @@ static inline int __swap_count(swp_entry_t entry) return 0; } -static inline int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { - return 0; + return false; } static inline int swp_swapcount(swp_entry_t entry) -- cgit v1.2.3 From 94ba17adaba0f651fdcf745c8891a88e2e028cfa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 7 Feb 2025 13:20:33 -0800 Subject: mm/damon: avoid applying DAMOS action to same entity multiple times The 'paddr' DAMON operations set can apply a DAMOS scheme's action to a large folio multiple times in a single DAMOS regions walk if the folio spans multiple DAMON regions. Add a field to the DAMOS scheme object that the underlying ops can use to know the last entity to which the scheme's action was applied.
The core layer unsets the field when each DAMOS regions walk is done for the given scheme. Update the 'paddr' ops to use the infrastructure and avoid the problem. Link: https://lkml.kernel.org/r/20250207212033.45269-3-sj@kernel.org Fixes: 57223ac29584 ("mm/damon/paddr: support the pageout scheme") Signed-off-by: SeongJae Park Reported-by: Usama Arif Closes: https://lore.kernel.org/20250203225604.44742-3-usamaarif642@gmail.com Cc: Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index c9074d569596..b4d37d9b9221 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -432,6 +432,7 @@ struct damos_access_pattern { * @wmarks: Watermarks for automated (in)activation of this scheme. * @target_nid: Destination node if @action is "migrate_{hot,cold}". * @filters: Additional set of &struct damos_filter for &action. + * @last_applied: Last @action applied ops-managing entity. * @stat: Statistics of this scheme. * @list: List head for siblings. * @@ -454,6 +455,15 @@ struct damos_access_pattern { * implementation could check pages of the region and skip &action to respect * &filters * + * The minimum entity to which @action can be applied depends on the underlying + * &struct damon_operations. Since it may not be aligned with the core layer + * abstraction, namely &struct damon_region, &struct damon_operations could + * apply @action to the same entity multiple times. Large folios that span + * multiple &struct damon_region objects could be such examples. The &struct + * damon_operations can use @last_applied to avoid that. DAMOS core logic + * unsets @last_applied when each regions walk for applying the scheme is + * finished. + * * After applying the &action to each region, &stat_count and &stat_sz is updated to reflect the number of regions and total size of regions that the * &action is applied. @@ -482,6 +492,7 @@ struct damos { int target_nid; }; struct list_head filters; + void *last_applied; struct damos_stat stat; struct list_head list; }; -- cgit v1.2.3 From a4811f53bb89b3524629b1be41add9349fe0a750 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sat, 8 Feb 2025 15:52:55 +0000 Subject: mm: provide mapping_wrprotect_range() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the fb_defio video driver, page dirty state is used to determine when frame buffer pages have been changed, allowing for batched, deferred I/O to be performed for efficiency. This implementation had only one means of doing so effectively - the use of the folio_mkclean() function. However, this use of the function is inappropriate, as the fb_defio implementation allocates kernel memory to back the framebuffer, and is then forced to set the page->index and mapping fields in order to permit the folio_mkclean() rmap traversal to proceed correctly. It is not correct to set these fields on kernel-allocated memory, and moreover, since these are not folios, page->index and mapping are deprecated fields, soon to be removed. We therefore need to provide a means by which we can correctly traverse the reverse mapping and write-protect mappings for a page backing an address_space page cache object at a given offset.
This patch provides this - mapping_wrprotect_range() - which allows for this operation to be performed for a specified address_space, offset, PFN and size, without requiring a folio nor, of course, an inappropriate use of page->index, mapping. With this provided, we can subsequently adjust the fb_defio implementation to make use of this function and avoid incorrect invocation of folio_mkclean() and more importantly, incorrect manipulation of page->index and mapping fields. Link: https://lkml.kernel.org/r/e5bf969d64e7f2f2ae944d42341fc8994b736a81.1739029358.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Helge Deller Cc: Jaya Kumar Cc: Kajtar Zsolt Cc: Maíra Canal Cc: Matthew Wilcox Cc: Simona Vetter Cc: Thomas Zimemrmann Signed-off-by: Andrew Morton --- include/linux/rmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 86425d42c1a9..69e9a431a40e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -738,6 +738,9 @@ unsigned long page_address_in_vma(const struct folio *folio, */ int folio_mkclean(struct folio *); +int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff, + unsigned long pfn, unsigned long nr_pages); + int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma); -- cgit v1.2.3 From 6cdef2ddce2b7de8faca69061a6780080cfecc10 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sat, 8 Feb 2025 15:52:56 +0000 Subject: fb_defio: do not use deprecated page->mapping, index fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the introduction of mapping_wrprotect_range() there is no need to use folio_mkclean() in order to write-protect mappings of frame buffer pages, and therefore no need to inappropriately set kernel-allocated page->index, mapping fields to permit this operation. Instead, store the pointer to the page cache object for the mapped driver in the fb_deferred_io object, and use the already stored page offset from the pageref object to look up mappings in order to write-protect them. This is justified, as for the page objects to store a mapping pointer at the point of assignment of pages, they must all reference the same underlying address_space object. Since the life time of the pagerefs is also the lifetime of the fb_deferred_io object, storing the pointer here makes sense. This eliminates the need for all of the logic around setting and maintaining page->index,mapping which we remove. This eliminates the use of folio_mkclean() entirely but otherwise should have no functional change. 
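A rough sketch of the flush path this enables (illustrative only; the driver-side rework is not part of this include/linux hunk, and the loop body is an assumption based on the pageref list shown in the hunk below):

	/* Re-protect each touched page after writeback so that the next
	 * write faults again and re-queues deferred I/O.
	 */
	list_for_each_entry(pageref, &fbdefio->pagereflist, list)
		mapping_wrprotect_range(fbdefio->mapping,
					pageref->offset >> PAGE_SHIFT,
					page_to_pfn(pageref->page), 1);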
[lorenzo.stoakes@oracle.com: fixup unused variable warnings] Link: https://lkml.kernel.org/r/d4018405-2762-4385-a816-e54cc23839ac@lucifer.local Link: https://lkml.kernel.org/r/81171ab16c14e3df28f6de9d14982cee528d8519.1739029358.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Tested-by: Kajtar Zsolt Acked-by: Thomas Zimmermann Cc: David Hildenbrand Cc: Helge Deller Cc: Jaya Kumar Cc: Maíra Canal Cc: Matthew Wilcox Cc: Simona Vetter Signed-off-by: Andrew Morton --- include/linux/fb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 5ba187e08cf7..cd653862ab99 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -225,6 +225,7 @@ struct fb_deferred_io { int open_count; /* number of opened files; protected by fb_info lock */ struct mutex lock; /* mutex that protects the pageref list */ struct list_head pagereflist; /* list of pagerefs for touched pages */ + struct address_space *mapping; /* page cache object for fb device */ /* callback */ struct page *(*get_page)(struct fb_info *info, unsigned long offset); void (*deferred_io)(struct fb_info *info, struct list_head *pagelist); -- cgit v1.2.3 From 6340584e489f1f8ddb65e44a7ad2ab9f3503c4d3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 11 Feb 2025 12:59:11 -0800 Subject: mm/vmstat: revert "fix a W=1 clang compiler warning" Commit 75b5ab134bb5 enabled -Wenum-enum-conversion in W=1 builds. Commit 30c2de0a267c ("mm/vmstat: fix a W=1 clang compiler warning") fixed a -Wenum-enum-conversion warning. Commit 8f6629c004b1 removed the -Wenum-enum-conversion option again from W=1 builds. Since the W=1 compiler warning fix is no longer necessary, revert it. Link: https://lkml.kernel.org/r/20250211205911.1707684-1-bvanassche@acm.org Signed-off-by: Bart Van Assche Acked-by: Vlastimil Babka Cc: Matthew Wilcox Cc: Ivan Shapovalov Cc: David Laight Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 9f3a04345b86..d2761bf8ff32 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -515,7 +515,7 @@ static inline const char *node_stat_name(enum node_stat_item item) static inline const char *lru_list_name(enum lru_list lru) { - return node_stat_name(NR_LRU_BASE + (enum node_stat_item)lru) + 3; // skip "nr_" + return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" } #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) -- cgit v1.2.3 From 0431c42622612a96cce97cdd4cfbe7d487606cf3 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Tue, 11 Feb 2025 12:43:40 +0000 Subject: mm/damon: introduce DAMOS filter type hugepage_size Patch series "mm/damon: add support for hugepage_size DAMOS filter", v5. The hugepage_size DAMOS filter can be used to gather statistics to check if memory regions of specific access temperatures are backed by hugepages of a size in a specific range. This filter can help to observe and prove the effectiveness of different schemes for shrinking/collapsing hugepages. This patch (of 4): This is to gather statistics to check if memory regions of specific access temperatures are backed by pages of a size in a specific range. This filter can help to observe and prove the effectiveness of different schemes for shrinking/collapsing hugepages.
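For instance, a scheme filter matching only regions backed by hugepages between 64KiB and 2MiB might be set up along these lines (illustrative values; the .matching field is assumed from the existing filter struct, the rest follows the hunk below):

	struct damos_filter filter = {
		.type = DAMOS_FILTER_TYPE_HUGEPAGE_SIZE,
		.matching = true,	/* match pages inside the range */
		.sz_range = { .min = SZ_64K, .max = SZ_2M },
	};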
[sj@kernel.org: add kernel-doc comment for damos_filter->sz_range] Link: https://lkml.kernel.org/r/20250218223058.52459-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250211124437.278873-1-usamaarif642@gmail.com Link: https://lkml.kernel.org/r/20250211124437.278873-2-usamaarif642@gmail.com Signed-off-by: Usama Arif Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Johannes Weiner Cc: Usama Arif Signed-off-by: Andrew Morton --- include/linux/damon.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index b4d37d9b9221..5e7ae7bca5dc 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -35,6 +35,16 @@ struct damon_addr_range { unsigned long end; }; +/** + * struct damon_size_range - Represents size for filter to operate on [@min, @max]. + * @min: Min size (inclusive). + * @max: Max size (inclusive). + */ +struct damon_size_range { + unsigned long min; + unsigned long max; +}; + /** * struct damon_region - Represents a monitoring target region. * @ar: The address range of the region. @@ -326,6 +336,7 @@ struct damos_stat { * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @DAMOS_FILTER_TYPE_YOUNG: Recently accessed pages. + * @DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: Page is part of a hugepage. * @DAMOS_FILTER_TYPE_ADDR: Address range. * @DAMOS_FILTER_TYPE_TARGET: Data Access Monitoring target. * @NR_DAMOS_FILTER_TYPES: Number of filter types. @@ -345,6 +356,7 @@ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, DAMOS_FILTER_TYPE_MEMCG, DAMOS_FILTER_TYPE_YOUNG, + DAMOS_FILTER_TYPE_HUGEPAGE_SIZE, DAMOS_FILTER_TYPE_ADDR, DAMOS_FILTER_TYPE_TARGET, NR_DAMOS_FILTER_TYPES, @@ -360,6 +372,7 @@ enum damos_filter_type { * @target_idx: Index of the &struct damon_target of * &damon_ctx->adaptive_targets if @type is * DAMOS_FILTER_TYPE_TARGET. + * @sz_range: Size range if @type is DAMOS_FILTER_TYPE_HUGEPAGE_SIZE. * @list: List head for siblings. * * Before applying the &damos->action to a memory region, DAMOS checks if each @@ -376,6 +389,7 @@ struct damos_filter { unsigned short memcg_id; struct damon_addr_range addr_range; int target_idx; + struct damon_size_range sz_range; }; struct list_head list; }; -- cgit v1.2.3 From b2ae5fccb8c0ec2167e6e6c5c5a5eb8d656f70ef Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:38 -0800 Subject: mm: introduce vma_start_read_locked{_nested} helpers Patch series "reimplement per-vma lock as a refcount", v10. Back when per-vma locks were introduced, vm_lock was moved out of vm_area_struct in [1] because of the performance regression caused by false cacheline sharing. Recent investigation [2] revealed that the regression is limited to a rather old Broadwell microarchitecture and even there it can be mitigated by disabling adjacent cacheline prefetching, see [3]. Splitting a single logical structure into multiple ones leads to more complicated management, extra pointer dereferences and overall less maintainable code. When that split-away part is a lock, it complicates things even further. With no performance benefits, there are no reasons for this split. Merging the vm_lock back into vm_area_struct also allows vm_area_struct to use SLAB_TYPESAFE_BY_RCU later in this patchset. This patchset: 1. moves vm_lock back into vm_area_struct, aligning it at the cacheline boundary and changing the cache to be cacheline-aligned to minimize cacheline sharing; 2.
changes vm_area_struct initialization to mark new vma as detached until it is inserted into vma tree; 3. replaces vm_lock and vma->detached flag with a reference counter; 4. regroups vm_area_struct members to fit them into 3 cachelines; 5. changes vm_area_struct cache to SLAB_TYPESAFE_BY_RCU to allow for their reuse and to minimize call_rcu() calls. Pagefault microbenchmarks show performance improvement:
Hmean faults/cpu-1   507926.5547 (   0.00%)  506519.3692 *  -0.28%*
Hmean faults/cpu-4   479119.7051 (   0.00%)  481333.6802 *   0.46%*
Hmean faults/cpu-7   452880.2961 (   0.00%)  455845.6211 *   0.65%*
Hmean faults/cpu-12  347639.1021 (   0.00%)  352004.2254 *   1.26%*
Hmean faults/cpu-21  200061.2238 (   0.00%)  229597.0317 *  14.76%*
Hmean faults/cpu-30  145251.2001 (   0.00%)  164202.5067 *  13.05%*
Hmean faults/cpu-48  106848.4434 (   0.00%)  120641.5504 *  12.91%*
Hmean faults/cpu-56   92472.3835 (   0.00%)  103464.7916 *  11.89%*
Hmean faults/sec-1   507566.1468 (   0.00%)  506139.0811 *  -0.28%*
Hmean faults/sec-4  1880478.2402 (   0.00%) 1886795.6329 *   0.34%*
Hmean faults/sec-7  3106394.3438 (   0.00%) 3140550.7485 *   1.10%*
Hmean faults/sec-12 4061358.4795 (   0.00%) 4112477.0206 *   1.26%*
Hmean faults/sec-21 3988619.1169 (   0.00%) 4577747.1436 *  14.77%*
Hmean faults/sec-30 3909839.5449 (   0.00%) 4311052.2787 *  10.26%*
Hmean faults/sec-48 4761108.4691 (   0.00%) 5283790.5026 *  10.98%*
Hmean faults/sec-56 4885561.4590 (   0.00%) 5415839.4045 *  10.85%*
This patch (of 18): Introduce helper functions which can be used to read-lock a VMA when holding mmap_lock for read. Replace direct accesses to vma->vm_lock with these new helpers. Link: https://lkml.kernel.org/r/20250213224655.1680278-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Lorenzo Stoakes Reviewed-by: Davidlohr Bueso Reviewed-by: Shakeel Butt Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index fd1e85b4b48a..6a4914bc1a38 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -735,6 +735,30 @@ static inline bool vma_start_read(struct vm_area_struct *vma) return true; } +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. + * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) +{ + mmap_assert_locked(vma->vm_mm); + down_read_nested(&vma->vm_lock->lock, subclass); +} + +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma).
vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. + * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline void vma_start_read_locked(struct vm_area_struct *vma) +{ + mmap_assert_locked(vma->vm_mm); + down_read(&vma->vm_lock->lock); +} + static inline void vma_end_read(struct vm_area_struct *vma) { rcu_read_lock(); /* keeps vma alive till the end of up_read */ -- cgit v1.2.3 From 7b6218ae1253491d56f21f4b1f3609f3dd873600 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:39 -0800 Subject: mm: move per-vma lock into vm_area_struct Back when per-vma locks were introduced, vm_lock was moved out of vm_area_struct in [1] because of the performance regression caused by false cacheline sharing. Recent investigation [2] revealed that the regression is limited to a rather old Broadwell microarchitecture and even there it can be mitigated by disabling adjacent cacheline prefetching, see [3]. Splitting a single logical structure into multiple ones leads to more complicated management, extra pointer dereferences and overall less maintainable code. When that split-away part is a lock, it complicates things even further. With no performance benefits, there are no reasons for this split. Merging the vm_lock back into vm_area_struct also allows vm_area_struct to use SLAB_TYPESAFE_BY_RCU later in this patchset. Move vm_lock back into vm_area_struct, aligning it at the cacheline boundary and changing the cache to be cacheline-aligned as well. With kernel compiled using defconfig, this causes VMA memory consumption to grow from 160 (vm_area_struct) + 40 (vm_lock) bytes to 256 bytes: slabinfo before: ... : ... vma_lock ... 40 102 1 : ... vm_area_struct ... 160 51 2 : ... slabinfo after moving vm_lock: ... : ... vm_area_struct ... 256 32 2 : ... Aggregate VMA memory consumption per 1000 VMAs grows from 50 to 64 pages, which is 5.5MB per 100000 VMAs. Note that the size of this structure is dependent on the kernel configuration and typically the original size is higher than 160 bytes. Therefore these calculations are close to the worst case scenario. A more realistic vm_area_struct usage before this change is: ... : ... vma_lock ... 40 102 1 : ... vm_area_struct ... 176 46 2 : ... Aggregate VMA memory consumption per 1000 VMAs grows from 54 to 64 pages, which is 3.9MB per 100000 VMAs. This memory consumption growth can be addressed later by optimizing the vm_lock. [1] https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/ [2] https://lore.kernel.org/all/ZsQyI%2F087V34JoIt@xsang-OptiPlex-9020/ [3] https://lore.kernel.org/all/CAJuCfpEisU8Lfe96AYJDZ+OM4NoPmnw9bP53cT_kbfP_pR+-2g@mail.gmail.com/ Link: https://lkml.kernel.org/r/20250213224655.1680278-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Lorenzo Stoakes Reviewed-by: Shakeel Butt Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E .
McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 28 ++++++++++++++++------------ include/linux/mm_types.h | 6 ++++-- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6a4914bc1a38..bf3b6362927f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -697,6 +697,12 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_PER_VMA_LOCK +static inline void vma_lock_init(struct vm_area_struct *vma) +{ + init_rwsem(&vma->vm_lock.lock); + vma->vm_lock_seq = UINT_MAX; +} + /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to @@ -714,7 +720,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) return false; - if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) + if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0)) return false; /* @@ -729,7 +735,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * This pairs with RELEASE semantics in vma_end_write_all(). */ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { - up_read(&vma->vm_lock->lock); + up_read(&vma->vm_lock.lock); return false; } return true; @@ -744,7 +750,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) { mmap_assert_locked(vma->vm_mm); - down_read_nested(&vma->vm_lock->lock, subclass); + down_read_nested(&vma->vm_lock.lock, subclass); } /* @@ -756,13 +762,13 @@ static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int static inline void vma_start_read_locked(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); - down_read(&vma->vm_lock->lock); + down_read(&vma->vm_lock.lock); } static inline void vma_end_read(struct vm_area_struct *vma) { rcu_read_lock(); /* keeps vma alive till the end of up_read */ - up_read(&vma->vm_lock->lock); + up_read(&vma->vm_lock.lock); rcu_read_unlock(); } @@ -791,7 +797,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - down_write(&vma->vm_lock->lock); + down_write(&vma->vm_lock.lock); /* * We should use WRITE_ONCE() here because we can have concurrent reads * from the early lockless pessimistic check in vma_start_read(). @@ -799,7 +805,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. 
*/ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - up_write(&vma->vm_lock->lock); + up_write(&vma->vm_lock.lock); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) @@ -811,7 +817,7 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) static inline void vma_assert_locked(struct vm_area_struct *vma) { - if (!rwsem_is_locked(&vma->vm_lock->lock)) + if (!rwsem_is_locked(&vma->vm_lock.lock)) vma_assert_write_locked(vma); } @@ -844,6 +850,7 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ +static inline void vma_lock_init(struct vm_area_struct *vma) {} static inline bool vma_start_read(struct vm_area_struct *vma) { return false; } static inline void vma_end_read(struct vm_area_struct *vma) {} @@ -878,10 +885,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf) extern const struct vm_operations_struct vma_dummy_vm_ops; -/* - * WARNING: vma_init does not initialize vma->vm_lock. - * Use vm_area_alloc()/vm_area_free() if vma needs locking. - */ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); @@ -890,6 +893,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) INIT_LIST_HEAD(&vma->anon_vma_chain); vma_mark_detached(vma, false); vma_numab_state_init(vma); + vma_lock_init(vma); } /* Use when VMA is not part of the VMA tree and needs no locking */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0234f14f2aa6..36dea20cd101 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -730,8 +730,6 @@ struct vm_area_struct { * slowpath. */ unsigned int vm_lock_seq; - /* Unstable RCU readers are allowed to read this. */ - struct vma_lock *vm_lock; #endif /* @@ -784,6 +782,10 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +#ifdef CONFIG_PER_VMA_LOCK + /* Unstable RCU readers are allowed to read this. */ + struct vma_lock vm_lock ____cacheline_aligned_in_smp; +#endif } __randomize_layout; #ifdef CONFIG_NUMA -- cgit v1.2.3 From 8ef95d8f15f9e785341775814f0ed2ee22017aa5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:40 -0800 Subject: mm: mark vma as detached until it's added into vma tree Current implementation does not set detached flag when a VMA is first allocated. This does not represent the real state of the VMA, which is detached until it is added into mm's VMA tree. Fix this by marking new VMAs as detached and resetting detached flag only after VMA is added into a tree. Introduce vma_mark_attached() to make the API more readable and to simplify possible future cleanup when vma->vm_mm might be used to indicate detached vma and vma_mark_attached() will need an additional mm parameter. Link: https://lkml.kernel.org/r/20250213224655.1680278-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . 
McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index bf3b6362927f..d333d4070362 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -821,12 +821,21 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) vma_assert_write_locked(vma); } -static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) +static inline void vma_mark_attached(struct vm_area_struct *vma) +{ + vma->detached = false; +} + +static inline void vma_mark_detached(struct vm_area_struct *vma) { /* When detaching vma should be write-locked */ - if (detached) - vma_assert_write_locked(vma); - vma->detached = detached; + vma_assert_write_locked(vma); + vma->detached = true; +} + +static inline bool is_vma_detached(struct vm_area_struct *vma) +{ + return vma->detached; } static inline void release_fault_lock(struct vm_fault *vmf) @@ -857,8 +866,8 @@ static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } -static inline void vma_mark_detached(struct vm_area_struct *vma, - bool detached) {} +static inline void vma_mark_attached(struct vm_area_struct *vma) {} +static inline void vma_mark_detached(struct vm_area_struct *vma) {} static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) @@ -891,7 +900,10 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); - vma_mark_detached(vma, false); +#ifdef CONFIG_PER_VMA_LOCK + /* vma is not locked, can't use vma_mark_detached() */ + vma->detached = true; +#endif vma_numab_state_init(vma); vma_lock_init(vma); } @@ -1086,6 +1098,7 @@ static inline int vma_iter_bulk_store(struct vma_iterator *vmi, if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; + vma_mark_attached(vma); return 0; } -- cgit v1.2.3 From 55e50223bf3e06abceaf68e2ad125458bb5f874f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:41 -0800 Subject: mm: introduce vma_iter_store_attached() to use with attached vmas vma_iter_store() functions can be used both when adding a new vma and when updating an existing one. However for existing ones we do not need to mark them attached as they are already marked that way. With vma->detached being a separate flag, double-marking a vmas as attached or detached is not an issue because the flag will simply be overwritten with the same value. However once we fold this flag into the refcount later in this series, re-attaching or re-detaching a vma becomes an issue since these operations will be incrementing/decrementing a refcount. Introduce vma_iter_store_new() and vma_iter_store_overwrite() to replace vma_iter_store() and avoid re-attaching a vma during vma update. Add assertions in vma_mark_attached()/vma_mark_detached() to catch invalid usage. Update vma tests to check for vma detached state correctness. 
Link: https://lkml.kernel.org/r/20250213224655.1680278-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d333d4070362..003c3e5c0a96 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -821,8 +821,19 @@ static inline void vma_assert_locked(struct vm_area_struct *vma) vma_assert_write_locked(vma); } +static inline void vma_assert_attached(struct vm_area_struct *vma) +{ + WARN_ON_ONCE(vma->detached); +} + +static inline void vma_assert_detached(struct vm_area_struct *vma) +{ + WARN_ON_ONCE(!vma->detached); +} + static inline void vma_mark_attached(struct vm_area_struct *vma) { + vma_assert_detached(vma); vma->detached = false; } @@ -830,6 +841,7 @@ static inline void vma_mark_detached(struct vm_area_struct *vma) { /* When detaching vma should be write-locked */ vma_assert_write_locked(vma); + vma_assert_attached(vma); vma->detached = true; } @@ -866,6 +878,8 @@ static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } +static inline void vma_assert_attached(struct vm_area_struct *vma) {} +static inline void vma_assert_detached(struct vm_area_struct *vma) {} static inline void vma_mark_attached(struct vm_area_struct *vma) {} static inline void vma_mark_detached(struct vm_area_struct *vma) {} -- cgit v1.2.3 From 2c2bd11caba2a45445c1f0c722451fe29e59db79 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:43 -0800 Subject: types: move struct rcuwait into types.h Move rcuwait struct definition into types.h so that rcuwait can be used without including rcuwait.h which includes other headers. Without this change mm_types.h can't use rcuwait due to the following circular dependency: mm_types.h -> rcuwait.h -> signal.h -> mm_types.h Link: https://lkml.kernel.org/r/20250213224655.1680278-7-surenb@google.com Suggested-by: Matthew Wilcox Signed-off-by: Suren Baghdasaryan Acked-by: Davidlohr Bueso Acked-by: Liam R. Howlett Acked-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E .
McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/rcuwait.h | 13 +------------ include/linux/types.h | 12 ++++++++++++ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 27343424225c..9ad134a04b41 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -4,18 +4,7 @@ #include #include - -/* - * rcuwait provides a way of blocking and waking up a single - * task in an rcu-safe manner. - * - * The only time @task is non-nil is when a user is blocked (or - * checking if it needs to) on a condition, and reset as soon as we - * know that the condition has succeeded and are awoken. - */ -struct rcuwait { - struct task_struct __rcu *task; -}; +#include #define __RCUWAIT_INITIALIZER(name) \ { .task = NULL, } diff --git a/include/linux/types.h b/include/linux/types.h index 1c509ce8f7f6..a3d2182c2686 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -248,5 +248,17 @@ typedef void (*swap_func_t)(void *a, void *b, int size); typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv); typedef int (*cmp_func_t)(const void *a, const void *b); +/* + * rcuwait provides a way of blocking and waking up a single + * task in an rcu-safe manner. + * + * The only time @task is non-nil is when a user is blocked (or + * checking if it needs to) on a condition, and reset as soon as we + * know that the condition has succeeded and are awoken. + */ +struct rcuwait { + struct task_struct __rcu *task; +}; + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_TYPES_H */ -- cgit v1.2.3 From 7440adb405dfc4abe21bc95abf3481e6a6649c05 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:44 -0800 Subject: mm: allow vma_start_read_locked/vma_start_read_locked_nested to fail With upcoming replacement of vm_lock with vm_refcnt, we need to handle a possibility of vma_start_read_locked/vma_start_read_locked_nested failing due to refcount overflow. Prepare for such possibility by changing these APIs and adjusting their users. Link: https://lkml.kernel.org/r/20250213224655.1680278-8-surenb@google.com Signed-off-by: Suren Baghdasaryan Cc: Lokesh Gidra Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 003c3e5c0a96..09b48af68699 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -747,10 +747,11 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * not be used in such cases because it might fail due to mm_lock_seq overflow. * This functionality is used to obtain vma read lock and drop the mmap read lock. 
*/ -static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) +static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) { mmap_assert_locked(vma->vm_mm); down_read_nested(&vma->vm_lock.lock, subclass); + return true; } /* @@ -759,10 +760,11 @@ static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int * not be used in such cases because it might fail due to mm_lock_seq overflow. * This functionality is used to obtain vma read lock and drop the mmap read lock. */ -static inline void vma_start_read_locked(struct vm_area_struct *vma) +static inline bool vma_start_read_locked(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); down_read(&vma->vm_lock.lock); + return true; } static inline void vma_end_read(struct vm_area_struct *vma) -- cgit v1.2.3 From ce0853966085dd8eab7153ce0b815c4a07d86698 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:45 -0800 Subject: mm: move mmap_init_lock() out of the header file mmap_init_lock() is used only from mm_init() in fork.c, therefore it does not have to reside in the header file. This move lets us avoid including additional headers in mmap_lock.h later, when mmap_init_lock() needs to initialize rcuwait object. Link: https://lkml.kernel.org/r/20250213224655.1680278-9-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 45a21faa3ff6..4706c6769902 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -122,12 +122,6 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int #endif /* CONFIG_PER_VMA_LOCK */ -static inline void mmap_init_lock(struct mm_struct *mm) -{ - init_rwsem(&mm->mmap_lock); - mm_lock_seqcount_init(mm); -} - static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); -- cgit v1.2.3 From 45ad9f5290dc4bb2249e951d4b3756d3ebda2d66 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:46 -0800 Subject: mm: uninline the main body of vma_start_write() vma_start_write() is used in many places and will grow in size very soon. It is not used in performance critical paths and uninlining it should limit the future code size growth. No functional changes. 
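For reference, the out-of-line body is simply the code removed from the header in the diff below (shown here as a sketch of the resulting function; upstream places it in the mm code):

	void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
	{
		down_write(&vma->vm_lock.lock);
		/*
		 * WRITE_ONCE() pairs with the early lockless check in
		 * vma_start_read(); see the comment removed from the header.
		 */
		WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
		up_write(&vma->vm_lock.lock);
	}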
Link: https://lkml.kernel.org/r/20250213224655.1680278-10-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 09b48af68699..c24c521e38a2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -787,6 +787,8 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_l return (vma->vm_lock_seq == *mm_lock_seq); } +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); + /* * Begin writing to a VMA. * Exclude concurrent readers under the per-VMA lock until the currently @@ -799,15 +801,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - down_write(&vma->vm_lock.lock); - /* - * We should use WRITE_ONCE() here because we can have concurrent reads - * from the early lockless pessimistic check in vma_start_read(). - * We don't really care about the correctness of that early check, but - * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. - */ - WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - up_write(&vma->vm_lock.lock); + __vma_start_write(vma, mm_lock_seq); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) -- cgit v1.2.3 From 7f8ceea0c58039dcea3d31b8d5da58aa5f6e12bf Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:47 -0800 Subject: refcount: provide ops for cases when object's memory can be reused For speculative lookups, where a successful inc_not_zero() pins the object but we still need to double-check that the object acquired is indeed the one we set out to acquire (identity check), this validation needs to happen *after* the increment. Similarly, when a new object is initialized and its memory might have been previously occupied by another object, all stores to initialize the object should happen *before* refcount initialization. Notably, SLAB_TYPESAFE_BY_RCU is one such example where this ordering is required for reference counting. Add refcount_{add|inc}_not_zero_acquire() to guarantee the proper ordering between acquiring a reference count on an object and performing the identity check for that object. Add refcount_set_release() to guarantee proper ordering between stores initializing object attributes and the store initializing the refcount. refcount_set_release() should be done after all other object attributes are initialized. Once refcount_set_release() is called, the object should be considered visible to other tasks even if it was not yet added into an object collection normally used to discover it.
This is because other tasks might have discovered the object previously occupying the same memory and after memory reuse they can succeed in taking refcount for the new object and start using it. Object reuse example to consider:

consumer:
	obj = lookup(collection, key);
	if (!refcount_inc_not_zero_acquire(&obj->ref))
		return;
	if (READ_ONCE(obj->key) != key) { /* identity check */
		put_ref(obj);
		return;
	}
	use(obj->value);

producer:
	remove(collection, obj->key);
	if (!refcount_dec_and_test(&obj->ref))
		return;
	obj->key = KEY_INVALID;
	free(obj);
	obj = malloc(); /* obj is reused */
	obj->key = new_key;
	obj->value = new_value;
	refcount_set_release(obj->ref, 1);
	add(collection, new_key, obj);

refcount_{add|inc}_not_zero_acquire() is required to prevent the following reordering when refcount_inc_not_zero() is used instead:

consumer:
	obj = lookup(collection, key);
	if (READ_ONCE(obj->key) != key) { /* reordered identity check */
		put_ref(obj);
		return;
	}

producer:
	remove(collection, obj->key);
	if (!refcount_dec_and_test(&obj->ref))
		return;
	obj->key = KEY_INVALID;
	free(obj);
	obj = malloc(); /* obj is reused */
	obj->key = new_key;
	obj->value = new_value;
	refcount_set_release(obj->ref, 1);
	add(collection, new_key, obj);

	if (!refcount_inc_not_zero(&obj->ref))
		return;
	use(obj->value); /* USING WRONG OBJECT */

refcount_set_release() is required to prevent the following reordering when refcount_set() is used instead:

consumer:
	obj = lookup(collection, key);

producer:
	remove(collection, obj->key);
	if (!refcount_dec_and_test(&obj->ref))
		return;
	obj->key = KEY_INVALID;
	free(obj);
	obj = malloc(); /* obj is reused */
	obj->key = new_key; /* new_key == old_key */
	refcount_set(obj->ref, 1);

	if (!refcount_inc_not_zero_acquire(&obj->ref))
		return;
	if (READ_ONCE(obj->key) != key) { /* pass since new_key == old_key */
		put_ref(obj);
		return;
	}
	use(obj->value); /* USING STALE obj->value */

	obj->value = new_value; /* reordered store */
	add(collection, key, obj);

[surenb@google.com: fix title underlines in refcount-vs-atomic.rst] Link: https://lkml.kernel.org/r/20250217161645.3137927-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-11-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Vlastimil Babka [slab] Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Peter Zijlstra Cc: Will Deacon Cc: Paul E. McKenney Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/refcount.h | 106 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/slab.h | 9 ++++ 2 files changed, 115 insertions(+) (limited to 'include/linux') diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 35f039ecb272..4589d2e7bfea 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -87,6 +87,15 @@ * The decrements dec_and_test() and sub_and_test() also provide acquire * ordering on success. * + * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() provide + * acquire and release ordering for cases when the memory occupied by the + * object might be reused to store another object.
This is important for the + * cases where secondary validation is required to detect such reuse, e.g. + * SLAB_TYPESAFE_BY_RCU. The secondary validation checks have to happen after + * the refcount is taken, hence acquire order is necessary. Similarly, when the + * object is initialized, all stores to its attributes should be visible before + * the refcount is set, otherwise a stale attribute value might be used by + * another task which succeeds in taking a refcount to the new object. */ #ifndef _LINUX_REFCOUNT_H @@ -125,6 +134,31 @@ static inline void refcount_set(refcount_t *r, int n) atomic_set(&r->refs, n); } +/** + * refcount_set_release - set a refcount's value with release ordering + * @r: the refcount + * @n: value to which the refcount will be set + * + * This function should be used when memory occupied by the object might be + * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. + * + * Provides release memory ordering which will order previous memory operations + * against this store. This ensures all updates to this object are visible + * once the refcount is set and stale values from the object previously + * occupying this memory are overwritten with new ones. + * + * This function should be called only after new object is fully initialized. + * After this call the object should be considered visible to other tasks even + * if it was not yet added into an object collection normally used to discover + * it. This is because other tasks might have discovered the object previously + * occupying the same memory and after memory reuse they can succeed in taking + * refcount to the new object and start using it. + */ +static inline void refcount_set_release(refcount_t *r, int n) +{ + atomic_set_release(&r->refs, n); +} + /** * refcount_read - get a refcount's value * @r: the refcount @@ -178,6 +212,52 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) return __refcount_add_not_zero(i, r, NULL); } +static inline __must_check __signed_wrap +bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) +{ + int old = refcount_read(r); + + do { + if (!old) + break; + } while (!atomic_try_cmpxchg_acquire(&r->refs, &old, old + i)); + + if (oldp) + *oldp = old; + + if (unlikely(old < 0 || old + i < 0)) + refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); + + return old; +} + +/** + * refcount_add_not_zero_acquire - add a value to a refcount with acquire ordering unless it is 0 + * + * @i: the value to add to the refcount + * @r: the refcount + * + * Will saturate at REFCOUNT_SATURATED and WARN. + * + * This function should be used when memory occupied by the object might be + * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. + * + * Provides acquire memory ordering on success, it is assumed the caller has + * guaranteed the object memory to be stable (RCU, etc.). It does provide a + * control dependency and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc_not_zero_acquire() should instead be used to increment a + * reference count. 
+ * + * Return: false if the passed refcount is 0, true otherwise + */ +static inline __must_check bool refcount_add_not_zero_acquire(int i, refcount_t *r) +{ + return __refcount_add_not_zero_acquire(i, r, NULL); +} + static inline __signed_wrap void __refcount_add(int i, refcount_t *r, int *oldp) { @@ -236,6 +316,32 @@ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) return __refcount_inc_not_zero(r, NULL); } +static inline __must_check bool __refcount_inc_not_zero_acquire(refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero_acquire(1, r, oldp); +} + +/** + * refcount_inc_not_zero_acquire - increment a refcount with acquire ordering unless it is 0 + * @r: the refcount to increment + * + * Similar to refcount_inc_not_zero(), but provides acquire memory ordering on + * success. + * + * This function should be used when memory occupied by the object might be + * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. + * + * Provides acquire memory ordering on success, it is assumed the caller has + * guaranteed the object memory to be stable (RCU, etc.). It does provide a + * control dependency and thereby orders future stores. See the comment on top. + * + * Return: true if the increment was successful, false otherwise + */ +static inline __must_check bool refcount_inc_not_zero_acquire(refcount_t *r) +{ + return __refcount_inc_not_zero_acquire(r, NULL); +} + static inline void __refcount_inc(refcount_t *r, int *oldp) { __refcount_add(1, r, oldp); diff --git a/include/linux/slab.h b/include/linux/slab.h index 09eedaecf120..ad902a2d692b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -136,6 +136,15 @@ enum _slab_flag_bits { * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. * + * Note that object identity check has to be done *after* acquiring a + * reference, therefore user has to ensure proper ordering for loads. + * Similarly, when initializing objects allocated with SLAB_TYPESAFE_BY_RCU, + * the newly allocated object has to be fully initialized *before* its + * refcount gets initialized and proper ordering for stores is required. + * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() are + * designed with the proper fences required for reference counting objects + * allocated with SLAB_TYPESAFE_BY_RCU. + * * Note that it is not possible to acquire a lock within a structure * allocated with SLAB_TYPESAFE_BY_RCU without first acquiring a reference * as described above. The reason is that SLAB_TYPESAFE_BY_RCU pages -- cgit v1.2.3 From 4e0dbe105d5088c77eb09de6f049aaf44711a2ec Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:48 -0800 Subject: refcount: introduce __refcount_{add|inc}_not_zero_limited_acquire Introduce functions to increase refcount but with a top limit above which they will fail to increase (the limit is inclusive). Setting the limit to INT_MAX indicates no limit. Link: https://lkml.kernel.org/r/20250213224655.1680278-12-surenb@google.com Signed-off-by: Suren Baghdasaryan Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. 
Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/refcount.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 4589d2e7bfea..80dc023ac2bf 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -213,13 +213,20 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) } static inline __must_check __signed_wrap -bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) +bool __refcount_add_not_zero_limited_acquire(int i, refcount_t *r, int *oldp, + int limit) { int old = refcount_read(r); do { if (!old) break; + + if (i > limit - old) { + if (oldp) + *oldp = old; + return false; + } } while (!atomic_try_cmpxchg_acquire(&r->refs, &old, old + i)); if (oldp) @@ -231,6 +238,18 @@ bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) return old; } +static inline __must_check bool +__refcount_inc_not_zero_limited_acquire(refcount_t *r, int *oldp, int limit) +{ + return __refcount_add_not_zero_limited_acquire(1, r, oldp, limit); +} + +static inline __must_check __signed_wrap +bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero_limited_acquire(i, r, oldp, INT_MAX); +} + /** * refcount_add_not_zero_acquire - add a value to a refcount with acquire ordering unless it is 0 * -- cgit v1.2.3 From f35ab95ca0af7a27feab57b9d7e906405bddb093 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:49 -0800 Subject: mm: replace vm_lock and detached flag with a reference count rw_semaphore is a sizable structure of 40 bytes and consumes considerable space for each vm_area_struct. However vma_lock has two important specifics which can be used to replace rw_semaphore with a simpler structure: 1. Readers never wait. They try to take the vma_lock and fall back to mmap_lock if that fails. 2. Only one writer at a time will ever try to write-lock a vma_lock because writers first take mmap_lock in write mode. Because of these requirements, full rw_semaphore functionality is not needed and we can replace rw_semaphore and the vma->detached flag with a refcount (vm_refcnt). When vma is in detached state, vm_refcnt is 0 and only a call to vma_mark_attached() can take it out of this state. Note that unlike before, now we enforce both vma_mark_attached() and vma_mark_detached() to be done only after vma has been write-locked. vma_mark_attached() changes vm_refcnt to 1 to indicate that it has been attached to the vma tree. When a reader takes read lock, it increments vm_refcnt, unless the top usable bit of vm_refcnt (0x40000000) is set, indicating presence of a writer. When writer takes write lock, it sets the top usable bit to indicate its presence. If there are readers, writer will wait using newly introduced mm->vma_writer_wait. Since all writers take mmap_lock in write mode first, there can be only one writer at a time. The last reader to release the lock will signal the writer to wake up. refcount might overflow if there are many competing readers, in which case read-locking will fail. 
Readers are expected to handle such failures. In summary: 1. all readers increment the vm_refcnt; 2. writer sets top usable (writer) bit of vm_refcnt; 3. readers cannot increment the vm_refcnt if the writer bit is set; 4. in the presence of readers, writer must wait for the vm_refcnt to drop to 1 (plus the VMA_LOCK_OFFSET writer bit), indicating an attached vma with no readers; 5. vm_refcnt overflow is handled by the readers. While this vm_lock replacement does not yet result in a smaller vm_area_struct (it stays at 256 bytes due to cacheline alignment), it allows for further size optimization by structure member regrouping to bring the size of vm_area_struct below 192 bytes. [surenb@google.com: fix a crash due to vma_end_read() that should have been removed] Link: https://lkml.kernel.org/r/20250220200208.323769-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-13-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Peter Zijlstra Suggested-by: Matthew Wilcox Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 128 +++++++++++++++++++++++++++++++---------------- include/linux/mm_types.h | 22 ++++---- 2 files changed, 95 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c24c521e38a2..06f179c844c3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -32,6 +32,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -697,19 +698,54 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_PER_VMA_LOCK -static inline void vma_lock_init(struct vm_area_struct *vma) +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) { - init_rwsem(&vma->vm_lock.lock); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + static struct lock_class_key lockdep_key; + + lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); +#endif + if (reset_refcnt) + refcount_set(&vma->vm_refcnt, 0); vma->vm_lock_seq = UINT_MAX; } +static inline bool is_vma_writer_only(int refcnt) +{ + /* + * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma + * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on + * a detached vma happens only in vma_mark_detached() and is a rare + * case, therefore most of the time there will be no unnecessary wakeup. + */ + return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; +} + +static inline void vma_refcount_put(struct vm_area_struct *vma) +{ + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + struct mm_struct *mm = vma->vm_mm; + int oldcnt; + + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { + + if (is_vma_writer_only(oldcnt - 1)) + rcuwait_wake_up(&mm->vma_writer_wait); + } +} + /* * Try to read-lock a vma. 
The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. + * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got + * detached. */ -static inline bool vma_start_read(struct vm_area_struct *vma) +static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) { + int oldcnt; + /* * Check before locking. A race might cause false locked result. * We can use READ_ONCE() for the mm_lock_seq here, and don't need @@ -718,15 +754,25 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * need ordering is below. */ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) - return false; + return NULL; - if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0)) - return false; + /* + * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() + * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + * Acquire fence is required here to avoid reordering against later + * vm_lock_seq check and checks inside lock_vma_under_rcu(). + */ + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) { + /* return EAGAIN if vma got detached from under us */ + return oldcnt ? NULL : ERR_PTR(-EAGAIN); + } + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); /* - * Overflow might produce false locked result. + * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. * False unlocked result is impossible because we modify and check - * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq + * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq * modification invalidates all existing locks. * * We must use ACQUIRE semantics for the mm_lock_seq so that if we are @@ -735,10 +781,11 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * This pairs with RELEASE semantics in vma_end_write_all(). */ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { - up_read(&vma->vm_lock.lock); - return false; + vma_refcount_put(vma); + return NULL; } - return true; + + return vma; } /* @@ -749,8 +796,14 @@ static inline bool vma_start_read(struct vm_area_struct *vma) */ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) { + int oldcnt; + mmap_assert_locked(vma->vm_mm); - down_read_nested(&vma->vm_lock.lock, subclass); + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) + return false; + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); return true; } @@ -762,16 +815,12 @@ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int */ static inline bool vma_start_read_locked(struct vm_area_struct *vma) { - mmap_assert_locked(vma->vm_mm); - down_read(&vma->vm_lock.lock); - return true; + return vma_start_read_locked_nested(vma, 0); } static inline void vma_end_read(struct vm_area_struct *vma) { - rcu_read_lock(); /* keeps vma alive till the end of up_read */ - up_read(&vma->vm_lock.lock); - rcu_read_unlock(); + vma_refcount_put(vma); } /* WARNING! 
Can only be used if mmap_lock is expected to be write-locked */ @@ -813,38 +862,35 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) static inline void vma_assert_locked(struct vm_area_struct *vma) { - if (!rwsem_is_locked(&vma->vm_lock.lock)) - vma_assert_write_locked(vma); + unsigned int mm_lock_seq; + + VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && + !__is_vma_write_locked(vma, &mm_lock_seq), vma); } +/* + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these + * assertions should be made either under mmap_write_lock or when the object + * has been isolated under mmap_write_lock, ensuring no competing writers. + */ static inline void vma_assert_attached(struct vm_area_struct *vma) { - WARN_ON_ONCE(vma->detached); + WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); } static inline void vma_assert_detached(struct vm_area_struct *vma) { - WARN_ON_ONCE(!vma->detached); + WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); } static inline void vma_mark_attached(struct vm_area_struct *vma) { - vma_assert_detached(vma); - vma->detached = false; -} - -static inline void vma_mark_detached(struct vm_area_struct *vma) -{ - /* When detaching vma should be write-locked */ vma_assert_write_locked(vma); - vma_assert_attached(vma); - vma->detached = true; + vma_assert_detached(vma); + refcount_set(&vma->vm_refcnt, 1); } -static inline bool is_vma_detached(struct vm_area_struct *vma) -{ - return vma->detached; -} +void vma_mark_detached(struct vm_area_struct *vma); static inline void release_fault_lock(struct vm_fault *vmf) { @@ -867,9 +913,9 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ -static inline void vma_lock_init(struct vm_area_struct *vma) {} -static inline bool vma_start_read(struct vm_area_struct *vma) - { return false; } +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} +static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) + { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) @@ -910,12 +956,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); -#ifdef CONFIG_PER_VMA_LOCK - /* vma is not locked, can't use vma_mark_detached() */ - vma->detached = true; -#endif vma_numab_state_init(vma); - vma_lock_init(vma); + vma_lock_init(vma, false); } /* Use when VMA is not part of the VMA tree and needs no locking */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 36dea20cd101..9de0a6cb3c2d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -629,9 +630,8 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) } #endif -struct vma_lock { - struct rw_semaphore lock; -}; +#define VMA_LOCK_OFFSET 0x40000000 +#define VMA_REF_LIMIT (VMA_LOCK_OFFSET - 1) struct vma_numab_state { /* @@ -709,19 +709,13 @@ struct vm_area_struct { }; #ifdef CONFIG_PER_VMA_LOCK - /* - * Flag to indicate areas detached from the mm->mm_mt tree. - * Unstable RCU readers are allowed to read this. 
- */ - bool detached; - /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) - * - vm_lock->lock (in write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) - * - vm_lock->lock (in read or write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * @@ -784,7 +778,10 @@ struct vm_area_struct { struct vm_userfaultfd_ctx vm_userfaultfd_ctx; #ifdef CONFIG_PER_VMA_LOCK /* Unstable RCU readers are allowed to read this. */ - struct vma_lock vm_lock ____cacheline_aligned_in_smp; + refcount_t vm_refcnt ____cacheline_aligned_in_smp; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map vmlock_dep_map; +#endif #endif } __randomize_layout; @@ -920,6 +917,7 @@ struct mm_struct { * by mmlist_lock */ #ifdef CONFIG_PER_VMA_LOCK + struct rcuwait vma_writer_wait; /* * This field has lock-like semantics, meaning it is sometimes * accessed with ACQUIRE/RELEASE semantics. -- cgit v1.2.3 From 6bef4c2f97221f3b595d08c8656eb5845ef80fe9 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:50 -0800 Subject: mm: move lesser used vm_area_struct members into the last cacheline Move several vm_area_struct members which are rarely or never used during page fault handling into the last cacheline to better pack vm_area_struct. As a result vm_area_struct will fit into 3 as opposed to 4 cachelines. New typical vm_area_struct layout: struct vm_area_struct { union { struct { long unsigned int vm_start; /* 0 8 */ long unsigned int vm_end; /* 8 8 */ }; /* 0 16 */ freeptr_t vm_freeptr; /* 0 8 */ }; /* 0 16 */ struct mm_struct * vm_mm; /* 16 8 */ pgprot_t vm_page_prot; /* 24 8 */ union { const vm_flags_t vm_flags; /* 32 8 */ vm_flags_t __vm_flags; /* 32 8 */ }; /* 32 8 */ unsigned int vm_lock_seq; /* 40 4 */ /* XXX 4 bytes hole, try to pack */ struct list_head anon_vma_chain; /* 48 16 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct anon_vma * anon_vma; /* 64 8 */ const struct vm_operations_struct * vm_ops; /* 72 8 */ long unsigned int vm_pgoff; /* 80 8 */ struct file * vm_file; /* 88 8 */ void * vm_private_data; /* 96 8 */ atomic_long_t swap_readahead_info; /* 104 8 */ struct mempolicy * vm_policy; /* 112 8 */ struct vma_numab_state * numab_state; /* 120 8 */ /* --- cacheline 2 boundary (128 bytes) --- */ refcount_t vm_refcnt (__aligned__(64)); /* 128 4 */ /* XXX 4 bytes hole, try to pack */ struct { struct rb_node rb (__aligned__(8)); /* 136 24 */ long unsigned int rb_subtree_last; /* 160 8 */ } __attribute__((__aligned__(8))) shared; /* 136 32 */ struct anon_vma_name * anon_name; /* 168 8 */ struct vm_userfaultfd_ctx vm_userfaultfd_ctx; /* 176 8 */ /* size: 192, cachelines: 3, members: 18 */ /* sum members: 176, holes: 2, sum holes: 8 */ /* padding: 8 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 4 */ } __attribute__((__aligned__(64))); Memory consumption per 1000 VMAs becomes 48 pages: slabinfo after vm_area_struct changes: ... : ... vm_area_struct ... 192 42 2 : ...
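The layout dump above is pahole output. Assuming a vmlinux built with debug info, the same dump can be regenerated to verify the packing:

	$ pahole -C vm_area_struct vmlinux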
Link: https://lkml.kernel.org/r/20250213224655.1680278-14-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9de0a6cb3c2d..c3aa0e20be41 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -725,17 +725,6 @@ struct vm_area_struct { */ unsigned int vm_lock_seq; #endif - - /* - * For areas with an address space and backing store, - * linkage into the address_space->i_mmap interval tree. - * - */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; - /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma * list, after a COW of one of the file pages. A MAP_SHARED vma @@ -755,14 +744,6 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ -#ifdef CONFIG_ANON_VMA_NAME - /* - * For private and shared anonymous mappings, a pointer to a null - * terminated string containing the name given to the vma, or NULL if - * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. - */ - struct anon_vma_name *anon_name; -#endif #ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; #endif @@ -775,7 +756,6 @@ struct vm_area_struct { #ifdef CONFIG_NUMA_BALANCING struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif - struct vm_userfaultfd_ctx vm_userfaultfd_ctx; #ifdef CONFIG_PER_VMA_LOCK /* Unstable RCU readers are allowed to read this. */ refcount_t vm_refcnt ____cacheline_aligned_in_smp; @@ -783,6 +763,24 @@ struct lockdep_map vmlock_dep_map; #endif #endif + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. + * + */ + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. + */ + struct anon_vma_name *anon_name; +#endif + struct vm_userfaultfd_ctx vm_userfaultfd_ctx; } __randomize_layout; #ifdef CONFIG_NUMA -- cgit v1.2.3 From e218d9fedd056a0b17468d6e19fddaf2c3550f97 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:52 -0800 Subject: mm: remove extra vma_numab_state_init() call vma_init() already memsets the whole vm_area_struct to 0, so there is no need for an additional vma_numab_state_init().
Link: https://lkml.kernel.org/r/20250213224655.1680278-16-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 06f179c844c3..aad932c4bcf0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -956,7 +956,6 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); - vma_numab_state_init(vma); vma_lock_init(vma, false); } -- cgit v1.2.3 From e49510bf00de4f832eaaebcfc113795f127ad519 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:53 -0800 Subject: mm: prepare lock_vma_under_rcu() for vma reuse possibility Once we make vma cache SLAB_TYPESAFE_BY_RCU, it will be possible for a vma to be reused and attached to another mm after lock_vma_under_rcu() locks the vma. lock_vma_under_rcu() should ensure that vma_start_read() is using the original mm and after locking the vma it should ensure that vma->vm_mm has not changed from under us. Link: https://lkml.kernel.org/r/20250213224655.1680278-17-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index aad932c4bcf0..e3f962de1677 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -739,10 +739,13 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. + * False locked result is possible if mm_lock_seq overflows or if vma gets + * reused and attached to a different mm before we lock it. * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got * detached. 
*/ -static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) { int oldcnt; @@ -753,7 +756,7 @@ static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) return NULL; /* @@ -780,7 +783,7 @@ static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { vma_refcount_put(vma); return NULL; } @@ -914,7 +917,8 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} -static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} -- cgit v1.2.3 From 3104138517fc66aad21f4a2487bb572e9fc2e3ec Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:54 -0800 Subject: mm: make vma cache SLAB_TYPESAFE_BY_RCU To enable SLAB_TYPESAFE_BY_RCU for vma cache we need to ensure that object reuse before RCU grace period is over will be detected by lock_vma_under_rcu(). Current checks are sufficient as long as vma is detached before it is freed. The only place this is not currently happening is in exit_mmap(). Add the missing vma_mark_detached() in exit_mmap(). Another issue which might trick lock_vma_under_rcu() during vma reuse is vm_area_dup(), which copies the entire content of the vma into a new one, overriding new vma's vm_refcnt and temporarily making it appear as attached. This might trick a racing lock_vma_under_rcu() into operating on a reused vma if it found the vma before it got reused. To prevent this situation, we should ensure that vm_refcnt stays at detached state (0) when it is copied and advances to attached state only after it is added into the vma tree. Introduce vm_area_init_from() which preserves new vma's vm_refcnt and use it in vm_area_dup(). Since all vmas are in detached state with no current readers when they are freed, lock_vma_under_rcu() will not be able to take vm_refcnt after vma got detached even if vma is reused. vma_mark_attached() is modified to include a release fence to ensure all stores to the vma happen before vm_refcnt gets initialized. Finally, make vm_area_cachep SLAB_TYPESAFE_BY_RCU. This will facilitate vm_area_struct reuse and will minimize the number of call_rcu() calls.
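To make the new contract concrete: vma_start_read() now returns the vma on success, NULL when the caller must fall back to mmap_lock, and ERR_PTR(-EAGAIN) when the vma was detached from under us. A condensed caller-side sketch under these rules, including the reuse recheck described above (illustrative only, not the verbatim lock_vma_under_rcu() body):

	static struct vm_area_struct *vma_lookup_and_lock(struct mm_struct *mm,
							  unsigned long addr)
	{
		MA_STATE(mas, &mm->mm_mt, addr, addr);
		struct vm_area_struct *vma;

		rcu_read_lock();
	retry:
		vma = mas_walk(&mas);
		if (!vma)
			goto out;

		vma = vma_start_read(mm, vma);
		if (IS_ERR_OR_NULL(vma)) {
			/* -EAGAIN: vma got detached from under us, look it up again */
			if (PTR_ERR(vma) == -EAGAIN) {
				mas_set(&mas, addr);
				goto retry;
			}
			/* NULL: could not lock, caller falls back to mmap_lock */
			vma = NULL;
			goto out;
		}

		/*
		 * With SLAB_TYPESAFE_BY_RCU the vma may have been reused and
		 * attached to a different mm or range; recheck both.
		 */
		if (unlikely(vma->vm_mm != mm ||
			     addr < vma->vm_start || addr >= vma->vm_end)) {
			vma_end_read(vma);
			vma = NULL;
		}
	out:
		rcu_read_unlock();
		return vma;
	}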
[surenb@google.com: remove atomic_set_release() usage in tools/] Link: https://lkml.kernel.org/r/20250217054351.2973666-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-18-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 +--- include/linux/mm_types.h | 13 ++++++++++--- include/linux/slab.h | 6 ------ 3 files changed, 11 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e3f962de1677..14115c9949d8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -258,8 +258,6 @@ void setup_initial_init_mm(void *start_code, void *end_code, struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); -/* Use only if VMA has no other users */ -void __vm_area_free(struct vm_area_struct *vma); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; @@ -890,7 +888,7 @@ static inline void vma_mark_attached(struct vm_area_struct *vma) { vma_assert_write_locked(vma); vma_assert_detached(vma); - refcount_set(&vma->vm_refcnt, 1); + refcount_set_release(&vma->vm_refcnt, 1); } void vma_mark_detached(struct vm_area_struct *vma); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c3aa0e20be41..6a93abb4452b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -574,6 +574,12 @@ static inline void *folio_get_private(struct folio *folio) typedef unsigned long vm_flags_t; +/* + * freeptr_t represents a SLUB freelist pointer, which might be encoded + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. + */ +typedef struct { unsigned long v; } freeptr_t; + /* * A region containing a mapping of a non-memory backed file under NOMMU * conditions. These are held in a global tree and are pinned by the VMAs that @@ -677,6 +683,9 @@ struct vma_numab_state { * * Only explicitly marked struct members may be accessed by RCU readers before * getting a stable reference. + * + * WARNING: when adding new members, please update vm_area_init_from() to copy + * them during vm_area_struct content duplication. */ struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ @@ -687,9 +696,7 @@ struct vm_area_struct { unsigned long vm_start; unsigned long vm_end; }; -#ifdef CONFIG_PER_VMA_LOCK - struct rcu_head vm_rcu; /* Used for deferred freeing. 
*/ -#endif + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ }; /* diff --git a/include/linux/slab.h b/include/linux/slab.h index ad902a2d692b..f8924fd6ea26 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -243,12 +243,6 @@ enum _slab_flag_bits { #define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED #endif -/* - * freeptr_t represents a SLUB freelist pointer, which might be encoded - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. - */ -typedef struct { unsigned long v; } freeptr_t; - /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * -- cgit v1.2.3 From 63a23847dc47113b879a5f53cc0ca5cedc881ffd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:06 +0000 Subject: fs: convert block_commit_write() to take a folio All callers now have a folio, so pass it in instead of converting folio->page->folio. Link: https://lkml.kernel.org/r/20250217192009.437916-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 932139c5d46f..6672e1a5031c 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -271,7 +271,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct folio **, void **, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); -void block_commit_write(struct page *page, unsigned int from, unsigned int to); +void block_commit_write(struct folio *folio, size_t from, size_t to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); -- cgit v1.2.3 From 52d671a1a36a16f3a0dd9a2beff964e75bce9787 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:07 +0000 Subject: fs: remove page_file_mapping() This wrapper has no more callers. Delete it. Link: https://lkml.kernel.org/r/20250217192009.437916-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 47bfc6b1b632..975c56fb4f85 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -575,11 +575,6 @@ static inline struct address_space *folio_flush_mapping(struct folio *folio) return folio_mapping(folio); } -static inline struct address_space *page_file_mapping(struct page *page) -{ - return folio_file_mapping(page_folio(page)); -} - /** * folio_inode - Get the host inode for this folio. * @folio: The folio. -- cgit v1.2.3 From 0d40cfe63a2f19b9d375382e6d90b9ebd412901e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:08 +0000 Subject: fs: remove folio_file_mapping() No callers of this function remain as filesystems no longer see swapfile pages through their normal read/write paths. 
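For the rare caller that still needs the swap-cache-aware lookup, the removed helper's body (quoted in the diff that follows) can simply be open-coded; a sketch, with a hypothetical name:

	static inline struct address_space *swap_aware_mapping(struct folio *folio)
	{
		/* what the removed folio_file_mapping() used to do */
		if (unlikely(folio_test_swapcache(folio)))
			return swapcache_mapping(folio);
		return folio->mapping;
	}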
Link: https://lkml.kernel.org/r/20250217192009.437916-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 975c56fb4f85..ad7c0f615e9b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -535,26 +535,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) struct address_space *folio_mapping(struct folio *); struct address_space *swapcache_mapping(struct folio *); -/** - * folio_file_mapping - Find the mapping this folio belongs to. - * @folio: The folio. - * - * For folios which are in the page cache, return the mapping that this - * page belongs to. Folios in the swap cache return the mapping of the - * swap file or swap device where the data is stored. This is different - * from the mapping returned by folio_mapping(). The only reason to - * use it is if, like NFS, you return 0 from ->activate_swapfile. - * - * Do not call this for folios which aren't in the page cache or swap cache. - */ -static inline struct address_space *folio_file_mapping(struct folio *folio) -{ - if (unlikely(folio_test_swapcache(folio))) - return swapcache_mapping(folio); - - return folio->mapping; -} - /** * folio_flush_mapping - Find the file mapping this folio belongs to. * @folio: The folio. -- cgit v1.2.3 From c009da4258f9885c5a3749fc004870db9c0e7a99 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:03 +0000 Subject: mm, cma: support multiple contiguous ranges, if requested Currently, CMA manages one range of physically contiguous memory. Creation of larger CMA areas with hugetlb_cma may run into gaps in physical memory, so that they are not able to allocate that contiguous physical range from memblock when creating the CMA area. This can happen, for example, on an AMD system with > 1TB of memory, where there will be a gap just below the 1TB (40bit DMA) line. If you have set aside most of the memory for potential hugetlb CMA allocation, cma_declare_contiguous_nid will fail. hugetlb_cma doesn't need the entire area to be one physically contiguous range. It just cares about being able to get physically contiguous chunks of a certain size (e.g. 1G), and it is fine to have the CMA area backed by multiple physical ranges, as long as it gets 1G contiguous allocations. Multi-range support is implemented by introducing an array of ranges, instead of just one big one. Each range has its own bitmap. Effectively, the allocate and release operations work as before, just per-range. So, instead of going through one large bitmap, they now go through a number of smaller ones. The maximum number of supported ranges is 8, as defined in CMA_MAX_RANGES. Since some current users of CMA expect a CMA area to just use one physically contiguous range, only allow for multiple ranges if a new interface, cma_declare_contiguous_multi, is used. The other interfaces will work like before, creating only CMA areas with 1 range. cma_declare_contiguous_multi works as follows, mimicking the default "bottom-up, above 4G" reservation approach: 0) Try cma_declare_contiguous_nid, which will use only one region. If this succeeds, return. This makes sure that for all the cases that currently work, the behavior remains unchanged even if the caller switches from cma_declare_contiguous_nid to cma_declare_contiguous_multi.
1) Select the largest free memblock ranges above 4G, with a maximum number of CMA_MAX_RANGES. 2) If the selected ranges (at most CMA_MAX_RANGES of them) do not add up to the total size requested, return -ENOMEM. 3) Sort the selected ranges by base address. 4) Reserve them bottom-up until we get what we wanted. Link: https://lkml.kernel.org/r/20250228182928.2645936-3-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Arnd Bergmann Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/cma.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cma.h b/include/linux/cma.h index d15b64f51336..863427c27dc2 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -40,6 +40,9 @@ static inline int __init cma_declare_contiguous(phys_addr_t base, return cma_declare_contiguous_nid(base, size, limit, alignment, order_per_bit, fixed, name, res_cma, NUMA_NO_NODE); } +extern int __init cma_declare_contiguous_multi(phys_addr_t size, + phys_addr_t align, unsigned int order_per_bit, + const char *name, struct cma **res_cma, int nid); extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, const char *name, -- cgit v1.2.3 From 624ab90b7b8758090654520b03c97cecc438a414 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:04 +0000 Subject: mm/cma: introduce cma_intersects function Now that CMA areas can have multiple physical ranges, code can't assume a CMA struct represents a base_pfn plus a size, as returned from cma_get_base. Most cases are OK though, since they all explicitly refer to CMA areas that were created using existing interfaces (cma_declare_contiguous_nid or cma_init_reserved_mem), which guarantees they have just one physical range. An exception is the s390 code, which walks all CMA ranges to see if they intersect with a range of memory that is about to be hotremoved. So, in the future, it might run into multi-range areas. To keep this check working, define a cma_intersects function. This just checks if a physaddr range intersects any of the ranges. Use it in the s390 check.
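A sketch of the hotremove-style check described above, written against cma_for_each_area() and the cma_intersects() declaration in the diff that follows (the param struct and function names are illustrative, not the actual s390 code):

	struct cma_range_check {
		unsigned long start, end;	/* physaddr range about to be offlined */
		bool hit;
	};

	static int cma_range_check_cb(struct cma *cma, void *data)
	{
		struct cma_range_check *chk = data;

		if (cma_intersects(cma, chk->start, chk->end)) {
			chk->hit = true;
			return 1;	/* non-zero stops the iteration */
		}
		return 0;
	}

	/* caller: refuse the hotremove if any CMA range intersects */
	struct cma_range_check chk = { .start = start, .end = end, .hit = false };

	cma_for_each_area(cma_range_check_cb, &chk);
	if (chk.hit)
		return -EBUSY;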
Link: https://lkml.kernel.org/r/20250228182928.2645936-4-fvdl@google.com Signed-off-by: Frank van der Linden Acked-by: Alexander Gordeev Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/cma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cma.h b/include/linux/cma.h index 863427c27dc2..03d85c100dcc 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -53,6 +53,7 @@ extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); +extern bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end); extern void cma_reserve_pages_on_error(struct cma *cma); -- cgit v1.2.3 From 5b47c02967ab770aa7661c8863a21b2fd59e35ff Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:08 +0000 Subject: mm/hugetlb: convert cmdline parameters from setup to early Convert the cmdline parameters (hugepagesz, hugepages, default_hugepagesz and hugetlb_free_vmemmap) to early parameters. Since parse_early_param might run before MMU setup on some platforms (powerpc), validation of huge page sizes as specified in command line parameters would fail. So instead, for the hstate-related values, just record them and parse them on demand from hugetlb_bootmem_alloc. The allocation of hugetlb bootmem pages is now done in hugetlb_bootmem_alloc, which is called explicitly at the start of mm_core_init(). core_initcall would be too late, as that happens with memblock already torn down. This change will allow earlier allocation and initialization of bootmem hugetlb pages later on. No functional change intended.
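As an illustration, all four of the now-early parameters on one (hypothetical) kernel command line:

	default_hugepagesz=1G hugepagesz=1G hugepages=16 hugepagesz=2M hugepages=512 hugetlb_free_vmemmap=on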
Link: https://lkml.kernel.org/r/20250228182928.2645936-8-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 76a75ec03dd6..a596aaa178d1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -174,6 +174,8 @@ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages[MAX_NUMNODES]; +void hugetlb_bootmem_alloc(void); + /* arch callbacks */ #ifndef CONFIG_HIGHPTE @@ -1257,6 +1259,10 @@ static inline bool hugetlbfs_pagecache_present( { return false; } + +static inline void hugetlb_bootmem_alloc(void) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, -- cgit v1.2.3 From 243a75e236802614075511266c8d1834d4bcffe9 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:10 +0000 Subject: mm/bootmem_info: export register_page_bootmem_memmap If other mm code wants to use this function for early memmap initialization (on the platforms that have it), it should be made available properly, not just unconditionally in mm.h. Make this function available for such cases. Link: https://lkml.kernel.org/r/20250228182928.2645936-10-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/bootmem_info.h | 7 +++++++ include/linux/mm.h | 3 --- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h index d8a8d245824a..4c506e76a808 100644 --- a/include/linux/bootmem_info.h +++ b/include/linux/bootmem_info.h @@ -18,6 +18,8 @@ enum bootmem_type { #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE void __init register_page_bootmem_info_node(struct pglist_data *pgdat); +void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, + unsigned long nr_pages); void get_page_bootmem(unsigned long info, struct page *page, enum bootmem_type type); @@ -58,6 +60,11 @@ static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) { } +static inline void register_page_bootmem_memmap(unsigned long section_nr, + struct page *map, unsigned long nr_pages) +{ +} + static inline void put_page_bootmem(struct page *page) { } diff --git a/include/linux/mm.h b/include/linux/mm.h index 14115c9949d8..e90ab0bdcc8c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4018,9 +4018,6 @@ static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, } #endif -void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, - unsigned long nr_pages); - enum mf_flags { MF_COUNT_INCREASED = 1 << 0,
MF_ACTION_REQUIRED = 1 << 1, -- cgit v1.2.3 From d65917c42373f70159a3fc453f8f028fd665e04f Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:11 +0000 Subject: mm/sparse: allow for alternate vmemmap section init at boot Add functions that are called just before the per-section memmap is allocated, and again later in boot once the memmap and zones have been initialized. They are called sparse_vmemmap_init_nid_early and sparse_vmemmap_init_nid_late, respectively. This allows for mm subsystems to add calls to initialize memmap and page structures in a specific way, if using SPARSEMEM_VMEMMAP. Specifically, hugetlb can pre-HVO bootmem allocated pages that way, so that no time and resources are wasted on allocating vmemmap pages, only to free them later (and possibly unnecessarily running the system out of memory in the process). Refactor some code and export a few convenience functions for external use. In sparse_init_nid, skip any sections that are already initialized, e.g. they have been initialized by sparse_vmemmap_init_nid_early already. The hugetlb code to use these functions will be added in a later commit. Export section_map_size, as any alternate memmap init code will want to use it. The internal config option to enable this is SPARSEMEM_VMEMMAP_PREINIT, which is selected if an architecture-specific option, ARCH_WANT_HUGETLB_VMEMMAP_PREINIT, is set. In the future, if other subsystems want to do preinit too, they can do it in a similar fashion. The internal config option is there because a section flag is used, and the number of flags available is architecture-dependent (see mmzone.h). Architectures can decide if there is room for the flag when enabling options that select SPARSEMEM_VMEMMAP_PREINIT. Fortunately, as of right now, all architectures that use sparse vmemmap do have room.
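A condensed sketch of the resulting flow (hook and predicate names from the diff that follows; the loop body is illustrative, not the exact sparse_init_nid code):

	/* before memmap allocation: a subsystem may pre-populate sections */
	sparse_vmemmap_init_nid_early(nid);

	for_each_present_section_nr(pnum_begin, pnum) {
		struct mem_section *ms = __nr_to_section(pnum);
		struct page *map;

		/* e.g. hugetlb pre-HVO already built this section's memmap */
		if (preinited_vmemmap_section(ms))
			continue;

		map = __populate_section_memmap(section_nr_to_pfn(pnum),
						PAGES_PER_SECTION, nid,
						NULL, NULL);
		sparse_init_early_section(nid, map, pnum, 0);
	}

	/* later, once zones are initialized: verification / fixup pass */
	sparse_vmemmap_init_nid_late(nid);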
Link: https://lkml.kernel.org/r/20250228182928.2645936-11-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Johannes Weiner Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + include/linux/mmzone.h | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e90ab0bdcc8c..03e4807bd911 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3928,6 +3928,7 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) #endif void *sparse_buffer_alloc(unsigned long size); +unsigned long section_map_size(void); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9540b41894da..44ecb2f90db4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1933,6 +1933,9 @@ enum { SECTION_IS_EARLY_BIT, #ifdef CONFIG_ZONE_DEVICE SECTION_TAINT_ZONE_DEVICE_BIT, +#endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT + SECTION_IS_VMEMMAP_PREINIT_BIT, #endif SECTION_MAP_LAST_BIT, }; @@ -1944,6 +1947,9 @@ enum { #ifdef CONFIG_ZONE_DEVICE #define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT) #endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT +#define SECTION_IS_VMEMMAP_PREINIT BIT(SECTION_IS_VMEMMAP_PREINIT_BIT) +#endif #define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1)) #define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT @@ -1998,6 +2004,30 @@ static inline int online_device_section(struct mem_section *section) } #endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT +static inline int preinited_vmemmap_section(struct mem_section *section) +{ + return (section && + (section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT)); +} + +void sparse_vmemmap_init_nid_early(int nid); +void sparse_vmemmap_init_nid_late(int nid); + +#else +static inline int preinited_vmemmap_section(struct mem_section *section) +{ + return 0; +} +static inline void sparse_vmemmap_init_nid_early(int nid) +{ +} + +static inline void sparse_vmemmap_init_nid_late(int nid) +{ +} +#endif + static inline int online_section_nr(unsigned long nr) { return online_section(__nr_to_section(nr)); @@ -2035,6 +2065,9 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) } #endif +void sparse_init_early_section(int nid, struct page *map, unsigned long pnum, + unsigned long flags); + #ifndef CONFIG_HAVE_ARCH_PFN_VALID /** * pfn_valid - check if there is a valid memory map entry for a PFN @@ -2116,6 +2149,8 @@ void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) +#define sparse_vmemmap_init_nid_early(_nid, _use) do {} while (0) +#define sparse_vmemmap_init_nid_late(_nid) do {} while (0) #define pfn_in_present_section pfn_valid #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ -- cgit v1.2.3 From 9eb6207b7812cee13431831c9661d21b53f3e93f Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:15 +0000 Subject: mm/sparse: add vmemmap_*_hvo functions Add a few functions to enable 
early HVO: vmemmap_populate_hvo vmemmap_undo_hvo vmemmap_wrprotect_hvo The populate and undo functions are expected to be used in early init, from the sparse_vmemmap_init_nid_early() function. The wrprotect function is to be used, potentially, later. To implement these functions, mostly re-use the existing compound pages vmemmap logic used by DAX. vmemmap_populate_address has its argument changed a bit in this commit: the page structure passed in to be reused in the mapping is replaced by a PFN and a flag. The flag indicates whether an extra ref should be taken on the vmemmap page containing the head page structure. Taking the ref is appropriate for DAX / ZONE_DEVICE, but not for HugeTLB HVO. The HugeTLB vmemmap optimization maps tail page structure pages read-only. The vmemmap_wrprotect_hvo function that does this is implemented separately, because it cannot be guaranteed that reserved page structures will not be written to during memory initialization. Even with CONFIG_DEFERRED_STRUCT_PAGE_INIT, they might still be written to (if they are at the bottom of a zone). So, vmemmap_populate_hvo leaves the tail page structure pages RW initially, and then later during initialization, after memmap init is fully done, vmemmap_wrprotect_hvo must be called to finish the job. Subsequent commits will use these functions for early HugeTLB HVO. Link: https://lkml.kernel.org/r/20250228182928.2645936-15-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 03e4807bd911..9a74a3ee68bc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3937,7 +3937,8 @@ p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, - struct vmem_altmap *altmap, struct page *reuse); + struct vmem_altmap *altmap, unsigned long ptpfn, + unsigned long flags); void *vmemmap_alloc_block(unsigned long size, int node); struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, @@ -3953,6 +3954,12 @@ int vmemmap_populate_hugepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); +int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); +int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); +void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); void vmemmap_populate_print_last(void); #ifdef CONFIG_MEMORY_HOTPLUG void vmemmap_free(unsigned long start, unsigned long end, -- cgit v1.2.3 From d58b2498200724e4f8c12d71a5953da03c8c8bdf Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:16 +0000 Subject: mm/hugetlb: deal with multiple calls to
hugetlb_bootmem_alloc Architectures that want pre-HVO of hugetlb vmemmap pages will need to call hugetlb_bootmem_alloc from an earlier spot in boot (before sparse_init). To facilitate some architectures doing this, protect hugetlb_bootmem_alloc against multiple calls. Also provide a helper function to check if it's been called, so that the early HVO code, to be added later, can see if there is anything to do. Link: https://lkml.kernel.org/r/20250228182928.2645936-16-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a596aaa178d1..f0ab4ca4ecf2 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -175,6 +175,7 @@ extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages[MAX_NUMNODES]; void hugetlb_bootmem_alloc(void); +bool hugetlb_bootmem_allocated(void); /* arch callbacks */ @@ -1263,6 +1264,11 @@ static inline bool hugetlbfs_pagecache_present( static inline void hugetlb_bootmem_alloc(void) { } + +static inline bool hugetlb_bootmem_allocated(void) +{ + return false; +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, -- cgit v1.2.3 From 752fe17af693323dba5622b19c858a46fed219a1 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:18 +0000 Subject: mm/hugetlb: add pre-HVO framework Define flags for pre-HVOed bootmem hugetlb pages, and act on them. The most important flag is the HVO flag, signalling that a bootmem allocated gigantic page has already been HVO-ed. If this flag is seen by the hugetlb bootmem gather code, the page is marked as HVO optimized. The HVO code will then not try to optimize it again. Instead, it will just map the tail page mirror pages read-only, completing the HVO steps. No functional change, as nothing sets the flags yet. 
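A sketch of the gather-side handling this commit describes (flag names from the diff that follows; the function name and the else branch are illustrative, using the existing per-folio vmemmap helpers):

	static void hugetlb_gather_one(struct huge_bootmem_page *m,
				       struct folio *folio)
	{
		if (m->flags & HUGE_BOOTMEM_HVO)
			/*
			 * vmemmap was already optimized at allocation time;
			 * only the read-only remapping of the tail mirror
			 * pages remains to be done.
			 */
			folio_set_hugetlb_vmemmap_optimized(folio);
		else
			hugetlb_vmemmap_optimize_folio(folio_hstate(folio), folio);
	}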
Link: https://lkml.kernel.org/r/20250228182928.2645936-18-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f0ab4ca4ecf2..bbccc3e6b9dd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -681,8 +681,12 @@ struct hstate { struct huge_bootmem_page { struct list_head list; struct hstate *hstate; + unsigned long flags; }; +#define HUGE_BOOTMEM_HVO 0x0001 +#define HUGE_BOOTMEM_ZONES_VALID 0x0002 + int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); void wait_for_freed_hugetlb_folios(void); -- cgit v1.2.3 From b1222550fbf73840e31a103305d03ad53b8f5a59 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:20 +0000 Subject: mm/hugetlb: do pre-HVO for bootmem allocated pages For large systems, the overhead of vmemmap pages for hugetlb is substantial. It's about 1.5% of memory, which is about 45G for a 3T system. If you want to configure most of that system for hugetlb (e.g. to use as backing memory for VMs), there is a chance of running out of memory on boot, even though you know that the 45G will become available later. To avoid this scenario, and since it's a waste to first allocate and then free that 45G during boot, do pre-HVO for hugetlb bootmem allocated pages ('gigantic' pages). pre-HVO is done by adding functions that are called from sparse_vmemmap_init_nid_early and sparse_vmemmap_init_nid_late. The first is called before memmap allocation, so it takes care of allocating memmap HVO-style. The second verifies that all bootmem pages look good; specifically, it checks that they do not intersect with multiple zones. This can only be done from the sparse_vmemmap_init_nid_late path, when zones have been initialized. The hugetlb page size must be aligned to the section size, and aligned to the size of memory described by the number of page structures contained in one PMD (since pre-HVO is not prepared to split PMDs). This should be true for most 'gigantic' pages; it is for 1G pages on x86, where both of these alignment requirements are 128M. This will only have an effect if hugetlb_bootmem_alloc was called early in boot. If not, it won't do anything, and HVO for bootmem hugetlb pages works as before.
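The PMD alignment constraint above works out as follows on x86-64 with 4K pages and a 64-byte struct page (a worked example):

	page structs per vmemmap PMD = PMD_SIZE / sizeof(struct page)
	                             = 2M / 64 = 32768
	memory they describe         = 32768 * 4K = 128M

which equals the 128M section size, so a 1G hugetlb page satisfies both constraints.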
Link: https://lkml.kernel.org/r/20250228182928.2645936-20-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index bbccc3e6b9dd..f6b82b0524ed 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -687,6 +687,8 @@ struct huge_bootmem_page { #define HUGE_BOOTMEM_HVO 0x0001 #define HUGE_BOOTMEM_ZONES_VALID 0x0002 +bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m); + int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); void wait_for_freed_hugetlb_folios(void); -- cgit v1.2.3 From 9320fa2717810a7d3451dd1a18c31f986a1d4068 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:24 +0000 Subject: mm/cma: introduce a cma validate function Define a function to check if a CMA area is valid, which means that its ranges do not cross any zone boundaries. Store the result in the newly created flags for each CMA area, so that repeated calls do not redo the check. This allows for checking the validity of a CMA area early, which is needed later in order to be able to allocate hugetlb bootmem pages from it with pre-HVO. Link: https://lkml.kernel.org/r/20250228182928.2645936-24-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/cma.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cma.h b/include/linux/cma.h index 03d85c100dcc..62d9c1cf6326 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -60,6 +60,7 @@ extern void cma_reserve_pages_on_error(struct cma *cma); #ifdef CONFIG_CMA struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp); bool cma_free_folio(struct cma *cma, const struct folio *folio); +bool cma_validate_zones(struct cma *cma); #else static inline struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) { @@ -70,6 +71,10 @@ static inline bool cma_free_folio(struct cma *cma, const struct folio *folio) { return false; } +static inline bool cma_validate_zones(struct cma *cma) +{ + return false; +} #endif #endif -- cgit v1.2.3 From d2d78671408061b5332d9ad357686812bbec5070 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Fri, 28 Feb 2025 18:29:27 +0000 Subject: mm/hugetlb: enable bootmem allocation from CMA areas If hugetlb_cma_only is enabled, we know that hugetlb pages can only be allocated from CMA. Now that there is an interface to do early reservations from a CMA area (returning memblock memory), it can be used to allocate hugetlb pages from CMA. This also allows for doing pre-HVO on these pages (if enabled).
Make sure to initialize the page structures and associated data correctly. Create a flag to signal that a hugetlb page has been allocated from CMA to make things a little easier. Some configurations of powerpc have a special hugetlb bootmem allocator, so introduce a boolean helper, arch_has_huge_bootmem_alloc(), that returns true if such an allocator is present. In that case, CMA bootmem allocations can't be used, so check that function before trying. Link: https://lkml.kernel.org/r/20250228182928.2645936-27-fvdl@google.com Signed-off-by: Frank van der Linden Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Dave Hansen Cc: David Hildenbrand Cc: Heiko Carstens Cc: Joao Martins Cc: Johannes Weiner Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Roman Gushchin (Cruise) Cc: Usama Arif Cc: Vasily Gorbik Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f6b82b0524ed..8f3ac832ee7f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -591,6 +591,7 @@ enum hugetlb_page_flags { HPG_freed, HPG_vmemmap_optimized, HPG_raw_hwp_unreliable, + HPG_cma, __NR_HPAGEFLAGS, }; @@ -650,6 +651,7 @@ HPAGEFLAG(Temporary, temporary) HPAGEFLAG(Freed, freed) HPAGEFLAG(VmemmapOptimized, vmemmap_optimized) HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable) +HPAGEFLAG(Cma, cma) #ifdef CONFIG_HUGETLB_PAGE @@ -678,14 +680,18 @@ struct hstate { char name[HSTATE_NAME_LEN]; }; +struct cma; + struct huge_bootmem_page { struct list_head list; struct hstate *hstate; unsigned long flags; + struct cma *cma; }; #define HUGE_BOOTMEM_HVO 0x0001 #define HUGE_BOOTMEM_ZONES_VALID 0x0002 +#define HUGE_BOOTMEM_CMA 0x0004 bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m); @@ -824,6 +830,17 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, } #endif +#ifndef arch_has_huge_bootmem_alloc +/* + * Some architectures do their own bootmem allocation, so they can't use + * early CMA allocation. + */ +static inline bool arch_has_huge_bootmem_alloc(void) +{ + return false; +} +#endif + static inline struct hstate *folio_hstate(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); -- cgit v1.2.3 From f809b9f3046f702bc1ae40695acb27c1b0abc346 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 19 Feb 2025 14:01:45 -0800 Subject: mm/damon: implement a new DAMOS filter type for unmapped pages Patch series "mm/damon: introduce DAMOS filter type for unmapped pages". User decides whether their memory will be mapped or unmapped. It implies that the two types of memory can have different characteristics and management requirements. Provide the DAMON-observability and DAMOS-operation capability for the different types by introducing a new DAMOS filter type for unmapped pages. This patch (of 2): Implement yet another DAMOS filter type for unmapped pages on DAMON kernel API, and add support of it from the physical address space DAMON operations set (paddr). Since it is only for unmapped pages, support from the virtual address spaces DAMON operations set (vaddr) is not required.
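A sketch of installing the new filter type through the DAMON kernel API, using the enum value from the diff that follows (assuming the damos_new_filter()/damos_add_filter() helpers; the matching/allow arguments shown are illustrative):

	struct damos_filter *filter;

	/* apply the scheme's action only to unmapped pages */
	filter = damos_new_filter(DAMOS_FILTER_TYPE_UNMAPPED, true, true);
	if (!filter)
		return -ENOMEM;
	damos_add_filter(scheme, filter);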
Link: https://lkml.kernel.org/r/20250219220146.133650-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250219220146.133650-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 5e7ae7bca5dc..242910b190c9 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -337,6 +337,7 @@ struct damos_stat { * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @DAMOS_FILTER_TYPE_YOUNG: Recently accessed pages. * @DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: Page is part of a hugepage. + * @DAMOS_FILTER_TYPE_UNMAPPED: Unmapped pages. * @DAMOS_FILTER_TYPE_ADDR: Address range. * @DAMOS_FILTER_TYPE_TARGET: Data Access Monitoring target. * @NR_DAMOS_FILTER_TYPES: Number of filter types. @@ -357,6 +358,7 @@ enum damos_filter_type { DAMOS_FILTER_TYPE_MEMCG, DAMOS_FILTER_TYPE_YOUNG, DAMOS_FILTER_TYPE_HUGEPAGE_SIZE, + DAMOS_FILTER_TYPE_UNMAPPED, DAMOS_FILTER_TYPE_ADDR, DAMOS_FILTER_TYPE_TARGET, NR_DAMOS_FILTER_TYPES, -- cgit v1.2.3 From 58abac769b05f6e972d34a02a46a04589d6e9853 Mon Sep 17 00:00:00 2001 From: Liu Ye Date: Wed, 12 Feb 2025 10:58:42 +0800 Subject: mm/folio_queue: delete __folio_order and use folio_order directly __folio_order is the same as folio_order; remove __folio_order, include mm.h, and use folio_order directly. Link: https://lkml.kernel.org/r/20250212025843.80283-2-liuye@kylinos.cn Signed-off-by: Liu Ye Reviewed-by: Shivank Garg Reviewed-by: Dev Jain Acked-by: David Howells Cc: Christian Brauner Signed-off-by: Andrew Morton --- include/linux/folio_queue.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h index 4d3f8074c137..45ad2408a80c 100644 --- a/include/linux/folio_queue.h +++ b/include/linux/folio_queue.h @@ -15,6 +15,7 @@ #define _LINUX_FOLIO_QUEUE_H #include +#include /* * Segment in a queue of running buffers. Each segment can hold a number of @@ -216,13 +217,6 @@ static inline void folioq_unmark3(struct folio_queue *folioq, unsigned int slot) clear_bit(slot, &folioq->marks3); } -static inline unsigned int __folio_order(struct folio *folio) -{ - if (!folio_test_large(folio)) - return 0; - return folio->_flags_1 & 0xff; -} - /** * folioq_append: Add a folio to a folio queue segment * @folioq: The segment to add to @@ -241,7 +235,7 @@ static inline unsigned int folioq_append(struct folio_queue *folioq, struct foli unsigned int slot = folioq->vec.nr++; folioq->vec.folios[slot] = folio; - folioq->orders[slot] = __folio_order(folio); + folioq->orders[slot] = folio_order(folio); return slot; } @@ -263,7 +257,7 @@ static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct unsigned int slot = folioq->vec.nr++; folioq->vec.folios[slot] = folio; - folioq->orders[slot] = __folio_order(folio); + folioq->orders[slot] = folio_order(folio); folioq_mark(folioq, slot); return slot; } -- cgit v1.2.3 From 44f76413496ec343da0d8292ceecdcabe3e6ec16 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 3 Mar 2025 11:03:23 +0900 Subject: zsmalloc: introduce new object mapping API Current object mapping API is a little cumbersome. First, it's inconsistent: sometimes it returns with page-faults disabled and sometimes with page-faults enabled. Second, and most importantly, it enforces atomicity restrictions on its users.
zs_map_object() has to return a linear object address which is not always possible because some objects span multiple physical (non-contiguous) pages. For such objects zsmalloc uses a per-CPU buffer to which the object's data is copied before a pointer to that per-CPU buffer is returned back to the caller. This leads to another, final, issue - extra memcpy(). Since the caller gets a pointer to the per-CPU buffer it can memcpy() data only to that buffer, and during zs_unmap_object() zsmalloc will memcpy() from that per-CPU buffer to the physical pages that the object in question spans across. New API splits functions by access mode: - zs_obj_read_begin(handle, local_copy) Returns a pointer to handle memory. For objects that span two physical pages a local_copy buffer is used to store object's data before the address is returned to the caller. Otherwise the object's page is kmap_local mapped directly. - zs_obj_read_end(handle, buf) Unmaps the page if it was kmap_local mapped by zs_obj_read_begin(). - zs_obj_write(handle, buf, len) Copies len bytes from the compression buffer to handle memory (takes care of objects that span two pages). This does not need any additional (e.g. per-CPU) buffers and writes the data directly to zsmalloc pool pages. In terms of performance, on a synthetic and completely reproducible test that allocates a fixed number of objects of fixed sizes and iterates over those objects, first mapping in RO then in RW mode: OLD API ======= 3 first results out of 10 369,205,778 instructions # 0.80 insn per cycle 40,467,926 branches # 113.732 M/sec 369,002,122 instructions # 0.62 insn per cycle 40,426,145 branches # 189.361 M/sec 369,036,706 instructions # 0.63 insn per cycle 40,430,860 branches # 204.105 M/sec [..] NEW API ======= 3 first results out of 10 265,799,293 instructions # 0.51 insn per cycle 29,834,567 branches # 170.281 M/sec 265,765,970 instructions # 0.55 insn per cycle 29,829,019 branches # 161.602 M/sec 265,764,702 instructions # 0.51 insn per cycle 29,828,015 branches # 189.677 M/sec [..] T-test on all 10 runs ===================== Difference at 95.0% confidence -1.03219e+08 +/- 55308.7 -27.9705% +/- 0.0149878% (Student's t, pooled s = 58864.4) The old API will stay around until the remaining users switch to the new one. After that we'll also remove the zsmalloc per-CPU buffer and CPU hotplug handling. The split of map(RO) and map(WO) into read_{begin/end}/write was suggested by Yosry Ahmed.
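A sketch of a caller using the new calls (signatures as in the diff that follows; buffer sizing and error handling elided):

	void *mem;
	char local_copy[PAGE_SIZE];	/* backs objects spanning two pages */

	/* read: may return the kmap_local'ed page or the local copy */
	mem = zs_obj_read_begin(pool, handle, local_copy);
	/* ... decompress comp_len bytes from mem ... */
	zs_obj_read_end(pool, handle, mem);

	/* write: copies straight into the pool pages, no per-CPU buffer */
	zs_obj_write(pool, handle, comp_buf, comp_len);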
Link: https://lkml.kernel.org/r/20250303022425.285971-15-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Suggested-by: Yosry Ahmed Reviewed-by: Yosry Ahmed Cc: Hillf Danton Cc: Kairui Song Cc: Minchan Kim Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/zsmalloc.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index a48cd0ffe57d..7d70983cf398 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -58,4 +58,12 @@ unsigned long zs_compact(struct zs_pool *pool); unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size); void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats); + +void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, + void *local_copy); +void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, + void *handle_mem); +void zs_obj_write(struct zs_pool *pool, unsigned long handle, + void *handle_mem, size_t mem_len); + #endif -- cgit v1.2.3 From 1b7e90020eb770aa52991a34f21552b4c38ea690 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 14 Mar 2025 00:59:33 +0800 Subject: mm, swap: use percpu cluster as allocation fast path The current allocation workflow first traverses the plist with a global lock held; after choosing a device, it uses the percpu cluster on that swap device. This commit moves the percpu cluster variable out of being tied to individual swap devices, making it a global percpu variable that is used directly for allocation as a fast path. The global percpu cluster variable will never point to an HDD device, and allocations on an HDD device are still globally serialized. This improves the allocator performance and prepares for removal of the slot cache in later commits. There shouldn't be much observable behavior change, except one thing: this changes how swap device allocation rotation works. Currently, each allocation will rotate the plist, and because of the existence of the slot cache (one order 0 allocation usually returns 64 entries), swap devices of the same priority are rotated for every 64 order 0 entries consumed. High order allocations are different: they bypass the slot cache, and so the swap device is rotated for every 16K, 32K, or up to 2M allocation. The rotation rule was never clearly defined or documented; it was changed several times without being mentioned. After this commit, and once the slot cache is gone in later commits, swap device rotation will happen for every consumed cluster. Ideally, non-HDD devices will be rotated once 2M of space has been consumed for each order. Fragmented clusters will rotate the device faster, which seems OK. HDD devices are rotated for every allocation regardless of the allocation order, which should be OK too and trivial. This commit also slightly changes allocation behaviour for the slot cache. The newly added cluster allocation fast path may allocate entries from a different device into the slot cache; this is not observable from user space, impacts performance only very slightly, and the slot cache is removed in the next commit anyway, so this can be ignored.
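As a rough mental model of the resulting allocation flow, consider the purely illustrative sketch below. Every name in it except swp_entry_t is invented here; the real logic lives in mm/swapfile.c and is refined by the follow-up commits.

	/* Hypothetical sketch, not the actual kernel code. */
	static bool swap_alloc_fast(swp_entry_t *entry, int order)
	{
		struct swap_info_struct *si;

		/* Fast path: the global percpu cluster remembers a likely
		 * next offset per order on a non-HDD device, so neither a
		 * plist traversal nor the global lock is needed. */
		si = this_cpu_read(global_percpu_cluster.si);
		if (si && alloc_from_cluster(si, order, entry))
			return true;

		/* Otherwise fall back to the slow path: traverse the plist
		 * under the global lock, pick a device, and refresh the
		 * percpu cluster. HDDs never enter the percpu cluster and
		 * stay globally serialized. */
		return false;
	}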
Link: https://lkml.kernel.org/r/20250313165935.63303-6-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kalesh Singh Cc: Matthew Wilcow (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fe91c293636..374bffc87427 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -284,12 +284,10 @@ enum swap_cluster_flags { #endif /* - * We assign a cluster to each CPU, so each CPU can allocate swap entry from - * its own cluster and swapout sequentially. The purpose is to optimize swapout - * throughput. + * We keep using same cluster for rotational device so IO will be sequential. + * The purpose is to optimize SWAP throughput on these device. */ -struct percpu_cluster { - local_lock_t lock; /* Protect the percpu_cluster above */ +struct swap_sequential_cluster { unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; @@ -315,8 +313,7 @@ struct swap_info_struct { atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ - struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ - struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */ + struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ -- cgit v1.2.3 From 0ff67f990bd45726e0d9e91111d998e7a3595b32 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 14 Mar 2025 00:59:34 +0800 Subject: mm, swap: remove swap slot cache The slot cache is no longer needed now; remove it and all related code.

- vm-scalability with: `usemem --init-time -O -y -x -R -31 1G`, 12G memory cgroup using simulated pmem as SWAP (32G pmem, 32 CPUs), 16 test runs for each case, measuring the total throughput:

                          Before (KB/s) (stdev)    After (KB/s) (stdev)
    Random (4K):          424907.60 (24410.78)     414745.92 (34554.78)
    Random (64K):         163308.82 (11635.72)     167314.50 (18434.99)
    Sequential (4K, !-R): 6150056.79 (103205.90)   6321469.06 (115878.16)

  The performance changes are below noise level.

- Build the Linux kernel with make -j96, using 4K folio with 1.5G memory cgroup limit and 64K folio with 2G memory cgroup limit, on top of tmpfs, 12 test runs, measuring the system time:

                      Before (s) (stdev)    After (s) (stdev)
    make -j96 (4K):   6445.69 (61.95)       6408.80 (69.46)
    make -j96 (64K):  6841.71 (409.04)      6437.99 (435.55)

Similarly to the above, the 64K mTHP case showed a slight improvement.
Link: https://lkml.kernel.org/r/20250313165935.63303-7-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baoquan He Cc: Baolin Wang Cc: Barry Song Cc: Chris Li Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kalesh Singh Cc: Matthew Wilcow (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 --- include/linux/swap_slots.h | 28 ---------------------------- 2 files changed, 31 deletions(-) delete mode 100644 include/linux/swap_slots.h (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 374bffc87427..c5856dcc263a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -465,7 +465,6 @@ void free_pages_and_swap_cache(struct encoded_page **, int); extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; -extern bool has_usable_swap(void); /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) @@ -483,13 +482,11 @@ swp_entry_t folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t, int); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); -extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h deleted file mode 100644 index 840aec3523b2..000000000000 --- a/include/linux/swap_slots.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SWAP_SLOTS_H -#define _LINUX_SWAP_SLOTS_H - -#include -#include -#include - -#define SWAP_SLOTS_CACHE_SIZE SWAP_BATCH -#define THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE (5*SWAP_SLOTS_CACHE_SIZE) -#define THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE (2*SWAP_SLOTS_CACHE_SIZE) - -struct swap_slots_cache { - bool lock_initialized; - struct mutex alloc_lock; /* protects slots, nr, cur */ - swp_entry_t *slots; - int nr; - int cur; - int n_ret; -}; - -void disable_swap_slots_cache_lock(void); -void reenable_swap_slots_cache_unlock(void); -void enable_swap_slots_cache(void); - -extern bool swap_slot_cache_enabled; - -#endif /* _LINUX_SWAP_SLOTS_H */ -- cgit v1.2.3 From b487a2da3575b6cdfb6d6559311830c8fea70bb9 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 14 Mar 2025 00:59:35 +0800 Subject: mm, swap: simplify folio swap allocation With the slot cache gone, clean up the allocation helpers even more. folio_alloc_swap() will be the only entry point for allocating a swap entry and adding the folio to the swap cache (except for suspend), making it the opposite of folio_free_swap().
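A caller under the new calling convention might look like the sketch below. Hedged: the zero-on-success/negative-errno return convention is inferred from the -EINVAL stub in the !CONFIG_SWAP path of this diff, and reading the entry back through folio->swap is an assumption made for illustration.

	/* Sketch only. */
	if (folio_alloc_swap(folio, GFP_KERNEL))
		goto fail;		/* no swap entry could be allocated */
	entry = folio->swap;		/* folio now has its swap entry */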
Link: https://lkml.kernel.org/r/20250313165935.63303-8-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kalesh Singh Cc: Matthew Wilcow (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index c5856dcc263a..9c99eee160f9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -478,7 +478,7 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -swp_entry_t folio_alloc_swap(struct folio *folio); +int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); @@ -586,11 +586,9 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline swp_entry_t folio_alloc_swap(struct folio *folio) +static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask) { - swp_entry_t entry; - entry.val = 0; - return entry; + return -EINVAL; } static inline bool folio_free_swap(struct folio *folio) -- cgit v1.2.3 From 7b54a96f30dec4b812841d179a26e88a7887dc06 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Fri, 31 Jan 2025 17:08:25 +0530 Subject: crash: remove an unused argument from reserve_crashkernel_generic() cmdline argument is not used in reserve_crashkernel_generic() so remove it. Correspondingly, all the callers have been updated as well. No functional change intended. Link: https://lkml.kernel.org/r/20250131113830.925179-3-sourabhjain@linux.ibm.com Signed-off-by: Sourabh Jain Acked-by: Hari Bathini Acked-by: Baoquan He Cc: Madhavan Srinivasan Cc: Mahesh Salgaonkar Cc: Michael Ellerman Signed-off-by: Andrew Morton --- include/linux/crash_reserve.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h index 5a9df944fb80..1fe7e7d1b214 100644 --- a/include/linux/crash_reserve.h +++ b/include/linux/crash_reserve.h @@ -32,13 +32,12 @@ int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, #define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() #endif -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high); +void __init reserve_crashkernel_generic(unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high); #else -static inline void __init reserve_crashkernel_generic(char *cmdline, +static inline void __init reserve_crashkernel_generic( unsigned long long crash_size, unsigned long long crash_base, unsigned long long crash_low_size, -- cgit v1.2.3 From 9f0552c978f8950b4661f1222290fb57af811a8b Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Fri, 31 Jan 2025 17:08:26 +0530 Subject: crash: let arch decide usable memory range in reserved area Although the crashkernel area is reserved, on architectures like PowerPC, it is possible for the crashkernel reserved area to contain components like RTAS, TCE, OPAL, etc. To avoid placing kexec segments over these components, PowerPC has its own set of APIs to locate holes in the crashkernel reserved area. 
Add an arch hook in the generic locate mem hole APIs so that architectures can handle such special regions in the crashkernel area while locating memory holes for kexec segments using generic APIs. With this, a lot of redundant arch-specific code can be removed, as it performs the exact same job as the generic APIs. To keep the generic and arch-specific changes separate, the changes related to moving PowerPC to use the generic APIs and the removal of PowerPC-specific APIs for memory hole allocation are done in a subsequent patch titled "powerpc/crash: Use generic APIs to locate memory hole for kdump". Link: https://lkml.kernel.org/r/20250131113830.925179-4-sourabhjain@linux.ibm.com Signed-off-by: Sourabh Jain Acked-by: Baoquan He Cc: Hari Bathini Cc: Madhavan Srinivasan Cc: Mahesh Salgaonkar Cc: Michael Ellerman Signed-off-by: Andrew Morton --- include/linux/kexec.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index f0e9f8eda7a3..407f8b0346aa 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -205,6 +205,15 @@ static inline int arch_kimage_file_post_load_cleanup(struct kimage *image) } #endif +#ifndef arch_check_excluded_range +static inline int arch_check_excluded_range(struct kimage *image, + unsigned long start, + unsigned long end) +{ + return 0; +} +#endif + #ifdef CONFIG_KEXEC_SIG #ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len); -- cgit v1.2.3 From 8c6bbda879b62f16bb03321a84554b4f63415c55 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 3 Feb 2025 16:05:22 +0100 Subject: rcu: provide a static initializer for hlist_nulls_head Patch series "ucount: Simplify refcounting with rcuref_t". I noticed that the atomic_dec_and_lock_irqsave() in put_ucounts() loops sometimes even during boot. Something like 2-3 iterations but still. This series replaces the refcounting with rcuref_t and adds an RCU lookup. This allows a lockless lookup in alloc_ucounts() if the entry is available and a cmpxchg()-less put of the item. This patch (of 4): Provide a static initializer for hlist_nulls_head so that it can be used in statically defined data structures. Link: https://lkml.kernel.org/r/20250203150525.456525-1-bigeasy@linutronix.de Link: https://lkml.kernel.org/r/20250203150525.456525-2-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Boqun Feng Cc: Steven Rostedt Cc: Joel Fernandes Cc: Josh Triplett Cc: Lai jiangshan Cc: Mathieu Desnoyers Cc: Mengen Sun Cc: "Paul E.
McKenney" Cc: "Uladzislau Rezki (Sony)" Cc: YueHong Wu Cc: Zqiang Signed-off-by: Andrew Morton --- include/linux/list_nulls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index fa6e8471bd22..248db9b77ee2 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -28,6 +28,7 @@ struct hlist_nulls_node { #define NULLS_MARKER(value) (1UL | (((long)value) << 1)) #define INIT_HLIST_NULLS_HEAD(ptr, nulls) \ ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) +#define HLIST_NULLS_HEAD_INIT(nulls) {.first = (struct hlist_nulls_node *)NULLS_MARKER(nulls)} #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) -- cgit v1.2.3 From 5f01a22c5b231dd590f61a2591b3090665733bcb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 3 Feb 2025 16:05:24 +0100 Subject: ucount: use RCU for ucounts lookups The ucounts element is looked up under ucounts_lock. This can be optimized by using RCU for a lockless lookup and return and element if the reference can be obtained. Replace hlist_head with hlist_nulls_head which is RCU compatible. Let find_ucounts() search for the required item within a RCU section and return the item if a reference could be obtained. This means alloc_ucounts() will always return an element (unless the memory allocation failed). Let put_ucounts() RCU free the element if the reference counter dropped to zero. Link: https://lkml.kernel.org/r/20250203150525.456525-4-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Boqun Feng Cc: Joel Fernandes Cc: Josh Triplett Cc: Lai jiangshan Cc: Mathieu Desnoyers Cc: Mengen Sun Cc: Steven Rostedt Cc: "Uladzislau Rezki (Sony)" Cc: YueHong Wu Cc: Zqiang Signed-off-by: Andrew Morton --- include/linux/user_namespace.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 7183e5aca282..ad4dbef92597 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -115,9 +116,10 @@ struct user_namespace { } __randomize_layout; struct ucounts { - struct hlist_node node; + struct hlist_nulls_node node; struct user_namespace *ns; kuid_t uid; + struct rcu_head rcu; atomic_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; -- cgit v1.2.3 From b4dc0bee2a749083028afba346910e198653f42a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 3 Feb 2025 16:05:25 +0100 Subject: ucount: use rcuref_t for reference counting Use rcuref_t for reference counting. This eliminates the cmpxchg loop in the get and put path. This also eliminates the need to acquire the lock in the put path because once the final user returns the reference, it can no longer be obtained anymore. Use rcuref_t for reference counting. Link: https://lkml.kernel.org/r/20250203150525.456525-5-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Paul E. 
McKenney Cc: Thomas Gleixner Cc: Boqun Feng Cc: Joel Fernandes Cc: Josh Triplett Cc: Lai jiangshan Cc: Mathieu Desnoyers Cc: Mengen Sun Cc: Steven Rostedt Cc: "Uladzislau Rezki (Sony)" Cc: YueHong Wu Cc: Zqiang Signed-off-by: Andrew Morton --- include/linux/user_namespace.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index ad4dbef92597..a0bb6d012137 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -8,6 +8,7 @@ #include #include #include +#include <linux/rcuref.h> #include #include #include @@ -120,7 +121,7 @@ struct ucounts { struct user_namespace *ns; kuid_t uid; struct rcu_head rcu; - atomic_t count; + rcuref_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; }; @@ -133,9 +134,15 @@ void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); -struct ucounts * __must_check get_ucounts(struct ucounts *ucounts); void put_ucounts(struct ucounts *ucounts); +static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) +{ + if (rcuref_get(&ucounts->count)) + return ucounts; + return NULL; +} + static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type) { return atomic_long_read(&ucounts->rlimit[type]); -- cgit v1.2.3 From 318f05a057157c400a92f17c5f2fa3dd96f57c7e Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Mon, 17 Feb 2025 21:39:41 +0100 Subject: reboot: replace __hw_protection_shutdown bool action parameter with an enum Patch series "reboot: support runtime configuration of emergency hw_protection action", v3. We currently leave the decision of whether to shutdown or reboot to protect hardware in an emergency situation to the individual drivers. This works out in some cases, where the driver detecting the critical failure has inside knowledge: It binds to the system management controller for example or is guided by hardware description that defines what to do. This is inadequate in the general case though, as a driver reporting e.g. an imminent power failure can't know whether a shutdown or a reboot would be more appropriate for a given hardware platform. To address this, this series adds a hw_protection kernel parameter and sysfs toggle that can be used to change the action from the shutdown default to reboot. A new hw_protection_trigger API then makes use of this default action. My particular use case is unattended embedded systems that don't have support for shutdown and that power on automatically when power is supplied:

- A brief power cycle gets detected by the driver
- The kernel powers down the system and SoC goes into shutdown mode
- Power is restored
- The system remains oblivious to the restored power
- System needs to be manually power cycled for a duration long enough to drain the capacitors

With this series, such systems can configure the kernel with hw_protection=reboot to have the boot firmware worry about critical conditions. This patch (of 12): Currently __hw_protection_shutdown() either reboots or shuts down the system according to its shutdown argument. To make the logic easier to follow, both inside __hw_protection_shutdown() and at caller sites, let's replace the bool parameter with an enum.
This will be extra useful when, in a later commit, a third action is added to the enumeration. No functional change. Link: https://lkml.kernel.org/r/20250217-hw_protection-reboot-v3-0-e1c09b090c0c@pengutronix.de Link: https://lkml.kernel.org/r/20250217-hw_protection-reboot-v3-1-e1c09b090c0c@pengutronix.de Signed-off-by: Ahmad Fatoum Reviewed-by: Tzung-Bi Shih Cc: Benson Leung Cc: Mark Brown Cc: Daniel Lezcano Cc: Fabio Estevam Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Liam Girdwood Cc: Lukasz Luba Cc: Matteo Croce Cc: Matti Vaittinen Cc: "Rafael J. Wysocki" Cc: Rob Herring Cc: Rui Zhang Cc: Sascha Hauer Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton --- include/linux/reboot.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index abcdde4df697..e97f6b8e8586 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -177,16 +177,28 @@ void ctrl_alt_del(void); extern void orderly_poweroff(bool force); extern void orderly_reboot(void); -void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown); + +/** + * enum hw_protection_action - Hardware protection action + * + * @HWPROT_ACT_SHUTDOWN: + * The system should be shut down (powered off) for HW protection. + * @HWPROT_ACT_REBOOT: + * The system should be rebooted for HW protection. + */ +enum hw_protection_action { HWPROT_ACT_SHUTDOWN, HWPROT_ACT_REBOOT }; + +void __hw_protection_shutdown(const char *reason, int ms_until_forced, + enum hw_protection_action action); static inline void hw_protection_reboot(const char *reason, int ms_until_forced) { - __hw_protection_shutdown(reason, ms_until_forced, false); + __hw_protection_shutdown(reason, ms_until_forced, HWPROT_ACT_REBOOT); } static inline void hw_protection_shutdown(const char *reason, int ms_until_forced) { - __hw_protection_shutdown(reason, ms_until_forced, true); + __hw_protection_shutdown(reason, ms_until_forced, HWPROT_ACT_SHUTDOWN); } /* -- cgit v1.2.3 From 81cab0f94ab864f13e29e561382d722daec6a55a Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Mon, 17 Feb 2025 21:39:45 +0100 Subject: reboot: rename now misleading __hw_protection_shutdown symbols The __hw_protection_shutdown function name has become misleading since it can cause either a shutdown (poweroff) or a reboot depending on its argument. To avoid further confusion, let's rename it, so it doesn't suggest that a poweroff is all it can do. Link: https://lkml.kernel.org/r/20250217-hw_protection-reboot-v3-5-e1c09b090c0c@pengutronix.de Signed-off-by: Ahmad Fatoum Reviewed-by: Tzung-Bi Shih Cc: Benson Leung Cc: Daniel Lezcano Cc: Fabio Estevam Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Liam Girdwood Cc: Lukasz Luba Cc: Mark Brown Cc: Matteo Croce Cc: Matti Vaittinen Cc: "Rafael J. Wysocki" Cc: Rob Herring (Arm) Cc: Rui Zhang Cc: Sascha Hauer Cc: "Serge E.
Hallyn" Signed-off-by: Andrew Morton --- include/linux/reboot.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index e97f6b8e8586..53c64e31b3cf 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -188,17 +188,17 @@ extern void orderly_reboot(void); */ enum hw_protection_action { HWPROT_ACT_SHUTDOWN, HWPROT_ACT_REBOOT }; -void __hw_protection_shutdown(const char *reason, int ms_until_forced, - enum hw_protection_action action); +void __hw_protection_trigger(const char *reason, int ms_until_forced, + enum hw_protection_action action); static inline void hw_protection_reboot(const char *reason, int ms_until_forced) { - __hw_protection_shutdown(reason, ms_until_forced, HWPROT_ACT_REBOOT); + __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_REBOOT); } static inline void hw_protection_shutdown(const char *reason, int ms_until_forced) { - __hw_protection_shutdown(reason, ms_until_forced, HWPROT_ACT_SHUTDOWN); + __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_SHUTDOWN); } /* -- cgit v1.2.3 From e016173f656b89f5f71b7d45dfc599ada8107eef Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Mon, 17 Feb 2025 21:39:47 +0100 Subject: reboot: add support for configuring emergency hardware protection action We currently leave the decision of whether to shutdown or reboot to protect hardware in an emergency situation to the individual drivers. This works out in some cases, where the driver detecting the critical failure has inside knowledge: It binds to the system management controller for example or is guided by hardware description that defines what to do. In the general case, however, the driver detecting the issue can't know what the appropriate course of action is and shouldn't be dictating the policy of dealing with it. Therefore, add a global hw_protection toggle that allows the user to specify whether shutdown or reboot should be the default action when the driver doesn't set policy. This introduces no functional change yet as hw_protection_trigger() has no callers, but these will be added in subsequent commits. [arnd@arndb.de: hide unused hw_protection_attr] Link: https://lkml.kernel.org/r/20250224141849.1546019-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20250217-hw_protection-reboot-v3-7-e1c09b090c0c@pengutronix.de Signed-off-by: Ahmad Fatoum Reviewed-by: Tzung-Bi Shih Cc: Benson Leung Cc: Daniel Lezcano Cc: Fabio Estevam Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Liam Girdwood Cc: Lukasz Luba Cc: Mark Brown Cc: Matteo Croce Cc: Matti Vaittinen Cc: "Rafael J. Wysocki" Cc: Rob Herring (Arm) Cc: Rui Zhang Cc: Sascha Hauer Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton --- include/linux/reboot.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 53c64e31b3cf..79e02876f2ba 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -181,16 +181,36 @@ extern void orderly_reboot(void); /** * enum hw_protection_action - Hardware protection action * + * @HWPROT_ACT_DEFAULT: + * The default action should be taken. This is HWPROT_ACT_SHUTDOWN + * by default, but can be overridden. * @HWPROT_ACT_SHUTDOWN: * The system should be shut down (powered off) for HW protection. * @HWPROT_ACT_REBOOT: * The system should be rebooted for HW protection. 
*/ -enum hw_protection_action { HWPROT_ACT_SHUTDOWN, HWPROT_ACT_REBOOT }; +enum hw_protection_action { HWPROT_ACT_DEFAULT, HWPROT_ACT_SHUTDOWN, HWPROT_ACT_REBOOT }; void __hw_protection_trigger(const char *reason, int ms_until_forced, enum hw_protection_action action); +/** + * hw_protection_trigger - Trigger default emergency system hardware protection action + * + * @reason: Reason of emergency shutdown or reboot to be printed. + * @ms_until_forced: Time to wait for orderly shutdown or reboot before + * triggering it. Negative value disables the forced + * shutdown or reboot. + * + * Initiate an emergency system shutdown or reboot in order to protect + * hardware from further damage. The exact action taken is controllable at + * runtime and defaults to shutdown. + */ +static inline void hw_protection_trigger(const char *reason, int ms_until_forced) +{ + __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_DEFAULT); +} + static inline void hw_protection_reboot(const char *reason, int ms_until_forced) { __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_REBOOT); -- cgit v1.2.3 From 5e40d0d196595874c3f0606f0642021e6ade8054 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Mon, 17 Feb 2025 21:39:52 +0100 Subject: reboot: retire hw_protection_reboot and hw_protection_shutdown helpers The hw_protection_reboot and hw_protection_shutdown functions mix mechanism with policy: they let the driver requesting an emergency action for hardware protection also decide how to deal with it. This is inadequate in the general case as a driver reporting e.g. an imminent power failure can't know whether a shutdown or a reboot would be more appropriate for a given hardware platform. With the addition of the hw_protection parameter, it's now possible to configure at runtime the default emergency action and drivers are expected to use hw_protection_trigger to have this parameter dictate policy. As no current users of either the hw_protection_reboot or hw_protection_shutdown helpers remain, remove them, so as not to tempt driver authors to call them. Existing users now either defer to hw_protection_trigger or call __hw_protection_trigger with a suitable argument directly when they have inside knowledge on whether a reboot or shutdown would be more appropriate. Link: https://lkml.kernel.org/r/20250217-hw_protection-reboot-v3-12-e1c09b090c0c@pengutronix.de Signed-off-by: Ahmad Fatoum Reviewed-by: Tzung-Bi Shih Cc: Benson Leung Cc: Daniel Lezcano Cc: Fabio Estevam Cc: Guenter Roeck Cc: Jonathan Corbet Cc: Liam Girdwood Cc: Lukasz Luba Cc: Mark Brown Cc: Matteo Croce Cc: Matti Vaittinen Cc: "Rafael J. Wysocki" Cc: Rob Herring (Arm) Cc: Rui Zhang Cc: Sascha Hauer Cc: "Serge E.
Hallyn" Signed-off-by: Andrew Morton --- include/linux/reboot.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 79e02876f2ba..aa08c3bbbf59 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -211,16 +211,6 @@ static inline void hw_protection_trigger(const char *reason, int ms_until_forced __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_DEFAULT); } -static inline void hw_protection_reboot(const char *reason, int ms_until_forced) -{ - __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_REBOOT); -} - -static inline void hw_protection_shutdown(const char *reason, int ms_until_forced) -{ - __hw_protection_trigger(reason, ms_until_forced, HWPROT_ACT_SHUTDOWN); -} - /* * Emergency restart, callable from an interrupt handler. */ -- cgit v1.2.3 From 0ac451ecec6dd516fdac1ca064467e753ef6ae1a Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 16 Feb 2025 00:56:18 +0800 Subject: lib min_heap: use size_t for array size and index variables Replace the int type with size_t for variables representing array sizes and indices in the min-heap implementation. Using size_t aligns with standard practices for size-related variables and avoids potential issues on platforms where int may be insufficient to represent all valid sizes or indices. Link: https://lkml.kernel.org/r/20250215165618.1757219-1-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Ching-Chun (Jim) Huang Cc: Yu-Chun Lin Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 1160bed6579e..79ddc0adbf2b 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -218,7 +218,7 @@ static size_t parent(size_t i, unsigned int lsbit, size_t size) /* Initialize a min-heap. */ static __always_inline -void __min_heap_init_inline(min_heap_char *heap, void *data, int size) +void __min_heap_init_inline(min_heap_char *heap, void *data, size_t size) { heap->nr = 0; heap->size = size; @@ -254,7 +254,7 @@ bool __min_heap_full_inline(min_heap_char *heap) /* Sift the element at pos down the heap. 
*/ static __always_inline -void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size, +void __min_heap_sift_down_inline(min_heap_char *heap, size_t pos, size_t elem_size, const struct min_heap_callbacks *func, void *args) { const unsigned long lsbit = elem_size & -elem_size; @@ -324,7 +324,7 @@ static __always_inline void __min_heapify_all_inline(min_heap_char *heap, size_t elem_size, const struct min_heap_callbacks *func, void *args) { - int i; + ssize_t i; for (i = heap->nr / 2 - 1; i >= 0; i--) __min_heap_sift_down_inline(heap, i, elem_size, func, args); @@ -379,7 +379,7 @@ bool __min_heap_push_inline(min_heap_char *heap, const void *element, size_t ele const struct min_heap_callbacks *func, void *args) { void *data = heap->data; - int pos; + size_t pos; if (WARN_ONCE(heap->nr >= heap->size, "Pushing on a full heap")) return false; @@ -428,10 +428,10 @@ bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx, __min_heap_del_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ __minheap_obj_size(_heap), _idx, _func, _args) -void __min_heap_init(min_heap_char *heap, void *data, int size); +void __min_heap_init(min_heap_char *heap, void *data, size_t size); void *__min_heap_peek(struct min_heap_char *heap); bool __min_heap_full(min_heap_char *heap); -void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, +void __min_heap_sift_down(min_heap_char *heap, size_t pos, size_t elem_size, const struct min_heap_callbacks *func, void *args); void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, const struct min_heap_callbacks *func, void *args); -- cgit v1.2.3 From 758502918e77323bf999a3eddd71466bf6135173 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 21 Feb 2025 05:02:21 -0800 Subject: rhashtable: remove needless return in three void APIs Remove needless 'return' in the following void APIs: rhltable_walk_enter() rhltable_free_and_destroy() rhltable_destroy() Since both the API and callee involved are void functions. Link: https://lkml.kernel.org/r/20250221-rmv_return-v1-16-cc8dff275827@quicinc.com Signed-off-by: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/rhashtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 8463a128e2f4..6c85b28ea30b 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -1259,7 +1259,7 @@ static inline int rhashtable_replace_fast( static inline void rhltable_walk_enter(struct rhltable *hlt, struct rhashtable_iter *iter) { - return rhashtable_walk_enter(&hlt->ht, iter); + rhashtable_walk_enter(&hlt->ht, iter); } /** @@ -1275,12 +1275,12 @@ static inline void rhltable_free_and_destroy(struct rhltable *hlt, void *arg), void *arg) { - return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); + rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); } static inline void rhltable_destroy(struct rhltable *hlt) { - return rhltable_free_and_destroy(hlt, NULL, NULL); + rhltable_free_and_destroy(hlt, NULL, NULL); } #endif /* _LINUX_RHASHTABLE_H */ -- cgit v1.2.3 From ede7cd607a1d206b9579264a7405d78316b2d7fd Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 21 Feb 2025 05:02:07 -0800 Subject: cpu: remove needless return in void API suspend_enable_secondary_cpus() Remove needless 'return' in void API suspend_enable_secondary_cpus() since both the API and thaw_secondary_cpus() are void functions. 
Link: https://lkml.kernel.org/r/20250221-rmv_return-v1-2-cc8dff275827@quicinc.com Signed-off-by: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/cpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 6a0a8f1c7c90..e3049543008b 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -148,7 +148,7 @@ static inline int suspend_disable_secondary_cpus(void) } static inline void suspend_enable_secondary_cpus(void) { - return thaw_secondary_cpus(); + thaw_secondary_cpus(); } #else /* !CONFIG_PM_SLEEP_SMP */ -- cgit v1.2.3 From 720ba85040a6549674e5599685ca7df86a902448 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 26 Feb 2025 14:22:57 +0100 Subject: mm/mmu_notifier: use MMU_NOTIFY_CLEAR in remove_device_exclusive_entry() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's limit the use of MMU_NOTIFY_EXCLUSIVE to the case where we convert a present PTE to device-exclusive. For the other case, we can simply use MMU_NOTIFY_CLEAR, because it really is clearing the device-exclusive entry first, to then install the present entry. Update the documentation of MMU_NOTIFY_EXCLUSIVE, to document the single use case more thoroughly. If ever required, we could add a separate MMU_NOTIFY_CLEAR_EXCLUSIVE; for now using MMU_NOTIFY_CLEAR seems to be sufficient. Link: https://lkml.kernel.org/r/20250226132257.2826043-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Alistair Popple Cc: Jason Gunthorpe Cc: Jérôme Glisse Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d4e714661826..bc2402a45741 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -43,10 +43,10 @@ struct mmu_interval_notifier; * a device driver to possibly ignore the invalidation if the * owner field matches the driver's device private pgmap owner. * - * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no - * longer have exclusive access to the page. When sent during creation of an - * exclusive range the owner will be initialised to the value provided by the - * caller of make_device_exclusive(), otherwise the owner will be NULL. + * @MMU_NOTIFY_EXCLUSIVE: conversion of a page table entry to device-exclusive. + * The owner is initialized to the value provided by the caller of + * make_device_exclusive(), such that this caller can filter out these + * events. */ enum mmu_notifier_event { MMU_NOTIFY_UNMAP = 0, -- cgit v1.2.3 From 1eb3471bf5749ff3769ec52723bd9b8d773c7a62 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 3 Mar 2025 14:17:19 -0800 Subject: mm/damon: add data structure for monitoring intervals auto-tuning Patch series "mm/damon: auto-tune aggregation interval". DAMON requires time-consuming and repetitive aggregation interval tuning. Introduce a feature for automating it using a feedback loop that aims at an amount of observed access events, like auto-exposing cameras.

Background: Access Frequency Monitoring and Aggregation Interval
================================================================

DAMON checks if each memory element (damon_region) is accessed or not for every user-specified time interval called 'sampling interval'. It aggregates the check results in a per-element counter called 'nr_accesses'.
DAMON users can read the counters to get the access temperature of a given element. The counters are reset after another user-specified time interval, called the 'aggregation interval'. This can be illustrated as DAMON continuously capturing a snapshot of the access events that happen and are captured within the last aggregation interval. This implies the aggregation interval plays a key role in the quality of the snapshots, like the camera exposure time. If it is too short, the amount of access events that happened and were captured for each snapshot is small, so each snapshot will show not many interesting things but just a cold and dark world with hopefully one pale blue dot or two. If it is too long, too many events are aggregated in a single shot, so each snapshot will look like a world of flames, or Muspellheim. In both cases it will be difficult to find practical insights.

Problem: Time Consuming and Repetitive Tuning
=============================================

The appropriate length of the aggregation interval depends on how frequently the system and workloads are making access events that DAMON can observe. Hence, users have to tune the interval with an excessive amount of tests with the target system and workloads. If the system and workloads are changed, the tuning should be done again. If the characteristic of the workloads is dynamic, it becomes more challenging. It is therefore time-consuming and repetitive. The tuning challenge mainly stems from the wrong question. It is not asking users what quality of monitoring results they want, but how DAMON should operate for their hidden goal. To give the right answer, users need to fully understand DAMON's mechanisms and the characteristics of their workloads. Users shouldn't be asked to understand the underlying mechanism. Understanding the characteristics of the workloads shouldn't be the role of users but of DAMON.

Aim-oriented Feedback-driven Auto-Tuning
=========================================

Fortunately, the appropriate length of the aggregation interval can be inferred using a feedback loop. If the current snapshots are not showing much interesting information, in other words, if they show only rare access events, increasing the aggregation interval helps, and vice versa. We tested this theory on a few real-world workloads, and documented one of those experiences in an official DAMON monitoring intervals tuning guideline. Since it is a simple theory that requires repeatable tries, it can be a good job for machines. Based on the guideline's theory, we designed an automation of aggregation interval tuning, in a way similar to a camera's auto-exposure feature. It defines the amount of interesting information as the ratio of the access events that DAMON actually observed to the theoretical maximum amount of them within each snapshot. Events are accounted at byte and sampling-attempt granularity. For example, let's say there is a region of size 'X' bytes. DAMON tried access check sampling for the region 'Y' times in total within a given aggregation interval. Among the 'Y' attempts, it showed positive results 'Z' times. Then, the theoretical maximum number of access events for the region is 'X * Y', and the number of access events that DAMON has observed for the region is 'X * Z'. The amount of interesting information is therefore '(X * Z) / (X * Y)'. Note that each snapshot would have multiple regions. Users can set an arbitrary value of the ratio as their target.
Once the target is set, the automation periodically measures the current value of the ratio and increases or decreases the aggregation interval if the ratio value is lower or higher than the target. The amount of the change is proportional to the distance between the current and the target values. To avoid the auto-tuning going too far, let users set the minimum and the maximum aggregation interval. Changing only the aggregation interval while the sampling interval is kept fixed makes the maximum level of access frequency in each snapshot, or the discernment of regions, inconsistent. Also, an unnecessarily short sampling interval causes meaningless monitoring overhead. The automation therefore adjusts the sampling interval together with the aggregation interval, while keeping the ratio between the two intervals. Users can set the ratio, or the discernment.

Discussion
==========

The modified question (aimed amount of access events, or light, in each snapshot) is easy to answer for both the users and the kernel. If users are interested in finding more cold regions, the value should be lower, and vice versa. If users have no idea, the kernel can suggest a fair default value based on some theories and experiments. For example, based on the Pareto principle (80/20 rule), we could expect a 20% target ratio to capture 80% of real access events. Since 80% might be too high, applying the rule once again, 4% (20% * 20%) may capture about 64% (80% * 80%) of real access events. The sampling to aggregation intervals ratio and the min/max aggregation intervals are also arguably easy to answer. What users want is discernment of regions for efficient system operation, for example, X amount of colder regions or Y amount of warmer regions, not exactly how many times each cache line is accessed at nanosecond granularity. The appropriate min/max aggregation interval can be set relatively naively, and may be better set based on the aimed monitoring overhead. Since the sampling interval directly decides the overhead, setting it based on the sampling interval can be easy. From my experience, I'd argue that an intervals ratio of 0.05, and a 5 milliseconds to 20 seconds sampling interval range (100 milliseconds to 400 seconds aggregation interval), can be a good default suggestion.

Evaluation
==========

On a machine running a real world server workload, I ran DAMON to monitor its physical address space for about 23 hours, with this feature turned on. We set it to tune the sampling interval in a range from 5 milliseconds to 10 seconds, aiming at a 4% DAMON-observed access ratio per three aggregation intervals. The exact command I used is as below.

    damo start --monitoring_intervals_goal 4% 3 5ms 10s --damos_action stat

During the test run, DAMON continuously updated the sampling and aggregation intervals as designed, within the given range. For all the time, DAMON was able to find the intervals that meet the target access events ratio in the given intervals range (sampling interval between 5 milliseconds and 10 seconds). For most of the time, the tuned sampling interval converged to 300-400 milliseconds. It made only a small amount of changes within the range. The average of the tuned sampling interval during the test was about 380 milliseconds. The workload periodically gets less load and decreases its CPU usage. Presumably this also caused it to make fewer memory access events. In reaction to such events, DAMON also increased the intervals as expected. It was still able to find the optimum interval that satisfies the target access ratio within the given intervals range.
Usually it converged to about 5 seconds. Once the workload gets a normal amount of load again, DAMON reactively reduced the intervals to the normal range. I collected and visualized DAMON's monitoring results on the server a few times. Every time, the visualized access pattern looked diverse and balanced, not biased toward only cold or hot pages. Let me show some of the snapshots that I collected near the end of the test (after about 23 hours had passed since starting DAMON on the server). The recency histogram looks as below. Please note that this visualization shows only very coarse-grained information. For more details about the visualization format, please refer to the DAMON user-space tool documentation[1].

# ./damo report access --style recency-sz-hist --tried_regions_of 0 0 0 --access_rate 0 0
[-19 h 7 m 45.514 s, -17 h 12 m 58.963 s)  6.198 GiB  |****                |
[-17 h 12 m 58.963 s, -15 h 18 m 12.412 s) 0 B        |                    |
[-15 h 18 m 12.412 s, -13 h 23 m 25.860 s) 0 B        |                    |
[-13 h 23 m 25.860 s, -11 h 28 m 39.309 s) 0 B        |                    |
[-11 h 28 m 39.309 s, -9 h 33 m 52.757 s)  0 B        |                    |
[-9 h 33 m 52.757 s, -7 h 39 m 6.206 s)    0 B        |                    |
[-7 h 39 m 6.206 s, -5 h 44 m 19.654 s)    0 B        |                    |
[-5 h 44 m 19.654 s, -3 h 49 m 33.103 s)   0 B        |                    |
[-3 h 49 m 33.103 s, -1 h 54 m 46.551 s)   0 B        |                    |
[-1 h 54 m 46.551 s, -0 ns)                16.967 GiB |*********           |
[-0 ns, --6886551440000 ns)                38.835 GiB |********************|
memory bw estimate: 9.425 GiB per second
total size: 62.000 GiB

It shows about 38 GiB of memory was accessed at least once within the last aggregation interval (given the ~300 milliseconds tuned sampling interval, this is about six seconds). This is about 61% of the total memory. In other words, DAMON found the warmest 61% of the system's memory. The number is particularly interesting given our Pareto-principle-based theory for the tuning goal value. We set it as 20% of 20% (4%), thinking it would capture 80% of 80% (64%) of real access events. And it found 61% hot memory, or working set. Nevertheless, to make the theory clearer, much more discussion and tests would be needed. At the moment, nonetheless, we can say making the target value higher helps find more hot memory regions. The histogram also shows an amount of cold memory. About 17 GiB of the system's memory has not been accessed for at least the last aggregation interval (about six seconds), and at most for about the last two hours. The real longest unaccessed time of the 17 GiB memory was about 19 minutes, though. This is a limitation of this visualization format. It further found a very cold 6 GiB of memory, which has not been accessed for at least the last 17 hours and at most 19 hours. What about the hot memory distribution? To see this, I captured and visualized the snapshot as an access temperature histogram. Again, please refer to the DAMON user-space tool documentation[1] for the format and what access temperature means. Both the visualization and the metric show only very coarse-grained and limited information. The resulting histogram looks like the below.
# ./damo report access --style temperature-sz-hist --tried_regions_of 0 0 0
[-6,840,763,776,000, -5,501,580,939,800) 6.198 GiB  |***                 |
[-5,501,580,939,800, -4,162,398,103,600) 0 B        |                    |
[-4,162,398,103,600, -2,823,215,267,400) 0 B        |                    |
[-2,823,215,267,400, -1,484,032,431,200) 0 B        |                    |
[-1,484,032,431,200, -144,849,595,000)   0 B        |                    |
[-144,849,595,000, 1,194,333,241,200)    55.802 GiB |********************|
[1,194,333,241,200, 2,533,516,077,400)   4.000 KiB  |*                   |
[2,533,516,077,400, 3,872,698,913,600)   4.000 KiB  |*                   |
[3,872,698,913,600, 5,211,881,749,800)   8.000 KiB  |*                   |
[5,211,881,749,800, 6,551,064,586,000)   12.000 KiB |*                   |
[6,551,064,586,000, 7,890,247,422,200)   4.000 KiB  |*                   |
memory bw estimate: 5.178 GiB per second
total size: 62.000 GiB

We can see most of the memory is in a similar access temperature range, and definitely some pages are extremely hot. To see the picture in more detail, let's capture and visualize the snapshot per DAMON-region, sorted by access temperature. The total number of regions was about 300. Due to the limited space, I'm showing only a few parts of the output here.

# ./damo report access --style hot --tried_regions_of 0 0 0
heatmap: 00000000888888889999999888888888888888888888888888888888888888888888888888888888
# min/max temperatures: -6,827,258,184,000, 17,589,052,500, column size: 793.600 MiB
|999999999999999999999999999999999999999| 4.000 KiB   access 100 % 18 h 9 m 43.918 s
|999999999999999999999999999999999999999| 8.000 KiB   access 100 % 17 h 56 m 5.351 s
|999999999999999999999999999999999999999| 4.000 KiB   access 100 % 15 h 24 m 19.634 s
|999999999999999999999999999999999999999| 4.000 KiB   access 100 % 14 h 10 m 55.606 s
|999999999999999999999999999999999999999| 4.000 KiB   access 100 % 11 h 34 m 18.993 s
[...]
|99999999999999999999999999999| 8.000 KiB   access 100 % 1 m 27.945 s
|11111111111111111111111111111| 80.000 KiB  access 15 %  1 m 21.180 s
|00000000000000000000000000000| 24.000 KiB  access 5 %   1 m 21.180 s
|00000000000000000000000000000| 5.919 GiB   access 10 %  1 m 14.415 s
|99999999999999999999999999999| 12.000 KiB  access 100 % 1 m 7.650 s
[...]
|0| 4.000 KiB   access 5 % 0 ns
|0| 12.000 KiB  access 5 % 0 ns
|0| 188.000 KiB access 0 % 0 ns
|0| 24.000 KiB  access 0 % 0 ns
|0| 48.000 KiB  access 0 % 0 ns
[...]
|0000000000000000000000000000000| 8.000 KiB    access 0 % 6 m 45.901 s
|00000000000000000000000000000000| 36.000 KiB  access 0 % 7 m 26.491 s
|00000000000000000000000000000000| 4.000 KiB   access 0 % 12 m 37.682 s
|000000000000000000000000000000000| 8.000 KiB  access 0 % 18 m 9.168 s
|000000000000000000000000000000000| 16.000 KiB access 0 % 19 m 3.288 s
|0000000000000000000000000000000000000000| 6.198 GiB access 0 % 18 h 57 m 52.582 s
memory bw estimate: 8.798 GiB per second
total size: 62.000 GiB

We can see DAMON found small and extremely hot regions that were accessed at every access check sample (once per about 300 milliseconds) for more than 10 hours. The access temperature rapidly decreases. DAMON was also able to find small and big regions that were not accessed for up to about 19 minutes. It even found an outlier cold region of 6 GiB that was not accessed for about 19 hours. It is unclear what the outlier region is, as of this writing. For the testing, DAMON was consuming about 0.1% of a single CPU's time. This is again an expected result, since DAMON was using a sampling interval of about 370 milliseconds in most cases.

# ps -p $kdamond_pid -o %cpu
%CPU
0.1

I also ran similar tests against a kernel build workload and an in-memory cache workload benchmark[2].
Detailed results, including the tuned intervals and captured access patterns, were of course different since those depend on the workloads. But the auto-tuning feature always worked as expected, like the above results for the real world workload. To wrap up, with the intervals auto-tuning feature, DAMON was able to capture access pattern snapshots of good quality on a real world server workload. The auto-tuning feature was able to adaptively react to the dynamic access patterns of the workload and reliably provide consistent monitoring results without manual human intervention. Also, the auto-tuning made DAMON consume only the necessary amount of resources for the required quality.

References
==========

[1] https://github.com/damonitor/damo/blob/next/USAGE.md#access-report-styles
[2] https://github.com/facebookresearch/DCPerf/blob/main/packages/tao_bench/README.md

This patch (of 8): Add data structures for DAMON sampling and aggregation intervals automatic tuning that aims at a specific amount of DAMON-observed access events per snapshot. In more detail, define the data structure for the tuning goal, link it to the monitoring attributes data structure so that DAMON kernel API callers can make the request, and update the DAMON parameters setup function to respect the new parameter. Link: https://lkml.kernel.org/r/20250303221726.484227-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250303221726.484227-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 242910b190c9..5f2609f24761 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -659,12 +659,38 @@ struct damon_call_control { bool canceled; }; +/** + * struct damon_intervals_goal - Monitoring intervals auto-tuning goal. + * + * @access_bp: Access events observation ratio to achieve in bp. + * @aggrs: Number of aggregations to achieve @access_bp within. + * @min_sample_us: Minimum resulting sampling interval in microseconds. + * @max_sample_us: Maximum resulting sampling interval in microseconds. + * + * DAMON automatically tunes &damon_attrs->sample_interval and + * &damon_attrs->aggr_interval aiming the ratio in bp (1/10,000) of + * DAMON-observed access events to theoretical maximum amount within @aggrs + * aggregations be same to @access_bp. The logic increases + * &damon_attrs->aggr_interval and &damon_attrs->sampling_interval in same + * ratio if the current access events observation ratio is lower than the + * target for each @aggrs aggregations, and vice versa. + * + * If @aggrs is zero, the tuning is disabled and hence this struct is ignored. + */ +struct damon_intervals_goal { + unsigned long access_bp; + unsigned long aggrs; + unsigned long min_sample_us; + unsigned long max_sample_us; +}; + /** * struct damon_attrs - Monitoring attributes for accuracy/overhead control. * * @sample_interval: The time between access samplings. * @aggr_interval: The time between monitor results aggregations. * @ops_update_interval: The time between monitoring operations updates. * @intervals_goal: Intervals auto-tuning goal. * @min_nr_regions: The minimum number of adaptive monitoring * regions.
From f04b0fedbe714f822bd066b319a60faa39a985a1 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 3 Mar 2025 14:17:20 -0800
Subject: mm/damon/core: implement intervals auto-tuning

Implement the DAMON sampling and aggregation intervals auto-tuning mechanism as briefly described in 'struct damon_intervals_goal'. The core part for deciding the direction and amount of the changes is implemented by reusing the feedback loop function which is being used for DAMOS quotas auto-tuning. Unlike the DAMOS quotas auto-tuning use case, though, limit the maximum decrease per adjustment to 50% of the current value. This is because rapid interval reductions have no real benefit and could unnecessarily increase the monitoring overhead.

Link: https://lkml.kernel.org/r/20250303221726.484227-3-sj@kernel.org
Signed-off-by: SeongJae Park
Cc: Jonathan Corbet
Signed-off-by: Andrew Morton
---
 include/linux/damon.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 5f2609f24761..b3e2c793c1f4 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -713,6 +713,17 @@ struct damon_attrs {
 	struct damon_intervals_goal intervals_goal;
 	unsigned long min_nr_regions;
 	unsigned long max_nr_regions;
+/* private: internal use only */
+	/*
+	 * @aggr_interval to @sample_interval ratio.
+	 * Core-external components call damon_set_attrs() with &damon_attrs
+	 * that this field is unset.  In the case, damon_set_attrs() sets this
+	 * field of resulting &damon_attrs.  Core-internal components such as
+	 * kdamond_tune_intervals() calls damon_set_attrs() with &damon_attrs
+	 * that this field is set.  In the case, damon_set_attrs() just keep
+	 * it.
+	 */
+	unsigned long aggr_samples;
 };

 /**
@@ -761,6 +772,11 @@ struct damon_ctx {
 	 *			update
 	 */
 	unsigned long next_ops_update_sis;
+	/*
+	 * number of sample intervals that should be passed before next
+	 * intervals tuning
+	 */
+	unsigned long next_intervals_tune_sis;
 	/* for waiting until the execution of the kdamond_fn is started */
 	struct completion kdamond_started;
 	/* for scheme quotas prioritization */
--
cgit v1.2.3
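[ Editorial sketch, not the in-tree code: the shape of one tuning step as described above. The helper name and the bp-scaled feedback form are assumptions for illustration; only the 50% cap on reductions is taken from the changelog. ]

	static unsigned long tune_interval(unsigned long cur_us,
			unsigned long target_bp, unsigned long observed_bp)
	{
		/* grow intervals when under-observing, shrink when over-observing */
		unsigned long next_us = cur_us * target_bp / max(observed_bp, 1UL);

		/* never reduce by more than 50% per adjustment */
		return max(next_us, cur_us / 2);
	}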
From 691ee97e1a9de0cdb3efb893c1f180e3f4a35e32 Mon Sep 17 00:00:00 2001
From: Ryan Roberts
Date: Mon, 3 Mar 2025 14:15:35 +0000
Subject: mm: fix lazy mmu docs and usage

Patch series "Fix lazy mmu mode", v2.

I'm planning to implement lazy mmu mode for arm64 to optimize vmalloc. As part of that, I will extend lazy mmu mode to cover kernel mappings in vmalloc table walkers. While lazy mmu mode is already used for kernel mappings in a few places, this will extend its use significantly.

Having reviewed the existing lazy mmu implementations in powerpc, sparc and x86, it looks like there are a bunch of bugs, some of which may be more likely to trigger once I extend the use of lazy mmu. So this series attempts to clarify the requirements and fix all the bugs in advance of that series. See patch #1 commit log for all the details.

This patch (of 5):

The docs, implementations and use of arch_[enter|leave]_lazy_mmu_mode() is a bit of a mess (to put it politely). There are a number of issues related to nesting of lazy mmu regions and confusion over whether the task, when in a lazy mmu region, is preemptible or not. Fix all the issues relating to the core-mm. Follow up commits will fix the arch-specific implementations. 3 arches implement lazy mmu; powerpc, sparc and x86.

When arch_[enter|leave]_lazy_mmu_mode() was first introduced by commit 6606c3e0da53 ("[PATCH] paravirt: lazy mmu mode hooks.patch"), it was expected that lazy mmu regions would never nest and that the appropriate page table lock(s) would be held while in the region, thus ensuring the region is non-preemptible. Additionally lazy mmu regions were only used during manipulation of user mappings.

Commit 38e0edb15bd0 ("mm/apply_to_range: call pte function with lazy updates") started invoking the lazy mmu mode in apply_to_pte_range(), which is used for both user and kernel mappings. For kernel mappings the region is no longer protected by any lock, so there is no longer any guarantee about non-preemptibility. Additionally, for RT configs, holding the PTL only implies no CPU migration; it doesn't prevent preemption.

Commit bcc6cc832573 ("mm: add default definition of set_ptes()") added arch_[enter|leave]_lazy_mmu_mode() to the default implementation of set_ptes(), used by x86. So after this commit, lazy mmu regions can be nested. Additionally commit 1a10a44dfc1d ("sparc64: implement the new page table range API") and commit 9fee28baa601 ("powerpc: implement the new page table range API") did the same for the sparc and powerpc set_ptes() overrides.

powerpc couldn't deal with preemption so avoids it in commit b9ef323ea168 ("powerpc/64s: Disable preemption in hash lazy mmu mode"), which explicitly disables preemption for the whole region in its implementation. x86 can support preemption (or at least it could until it tried to add support for nesting; more on this below). Sparc looks to be totally broken in the face of preemption, as far as I can tell.

powerpc can't deal with nesting, so avoids it in commit 47b8def9358c ("powerpc/mm: Avoid calling arch_enter/leave_lazy_mmu() in set_ptes"), which removes the lazy mmu calls from its implementation of set_ptes(). x86 attempted to support nesting in commit 49147beb0ccb ("x86/xen: allow nesting of same lazy mode") but as far as I can tell, this breaks its support for preemption.

In short, it's all a mess; the semantics for arch_[enter|leave]_lazy_mmu_mode() are not clearly defined and as a result the implementations all have different expectations, sticking plasters and bugs.

arm64 is aiming to start using these hooks, so let's clean everything up before adding an arm64 implementation. Update the documentation to state that lazy mmu regions can never be nested, must not be called in interrupt context and preemption may or may not be enabled for the duration of the region. And fix the generic implementation of set_ptes() to avoid nesting. arch-specific fixes to conform to the new spec will follow this one.

These issues were spotted by code review and I have no evidence of issues being reported in the wild.

Link: https://lkml.kernel.org/r/20250303141542.3371656-1-ryan.roberts@arm.com
Link: https://lkml.kernel.org/r/20250303141542.3371656-2-ryan.roberts@arm.com
Fixes: bcc6cc832573 ("mm: add default definition of set_ptes()")
Signed-off-by: Ryan Roberts
Acked-by: David Hildenbrand
Acked-by: Juergen Gross
Cc: Andreas Larsson
Cc: Borislav Betkov
Cc: Boris Ostrovsky
Cc: Catalin Marinas
Cc: Dave Hansen
Cc: David S. Miller
Peter Anvin" Cc: Ingo Molnar Cc: Juegren Gross Cc: Matthew Wilcow (Oracle) Cc: Thomas Gleinxer Cc: Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 94d267d02372..787c632ee2c9 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -222,10 +222,14 @@ static inline int pmd_dirty(pmd_t pmd) * hazard could result in the direct mode hypervisor case, since the actual * write to the page tables may not yet have taken place, so reads though * a raw PTE pointer after it has been modified are not guaranteed to be - * up to date. This mode can only be entered and left under the protection of - * the page table locks for all page tables which may be modified. In the UP - * case, this is required so that preemption is disabled, and in the SMP case, - * it must synchronize the delayed page table writes properly on other CPUs. + * up to date. + * + * In the general case, no lock is guaranteed to be held between entry and exit + * of the lazy mode. So the implementation must assume preemption may be enabled + * and cpu migration is possible; it must take steps to be robust against this. + * (In practice, for user PTE updates, the appropriate page table lock(s) are + * held, but for kernel PTE updates, no lock is held). Nesting is not permitted + * and the mode cannot be used in interrupt context. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE #define arch_enter_lazy_mmu_mode() do {} while (0) @@ -287,7 +291,6 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, { page_table_check_ptes_set(mm, ptep, pte, nr); - arch_enter_lazy_mmu_mode(); for (;;) { set_pte(ptep, pte); if (--nr == 0) @@ -295,7 +298,6 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, ptep++; pte = pte_next_pfn(pte); } - arch_leave_lazy_mmu_mode(); } #endif #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) -- cgit v1.2.3 From 0e2759afcaf9bff25a63201856fa89b64181749f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 27 Feb 2025 23:58:07 -0800 Subject: page_counter: track failcnt only for legacy cgroups Currently page_counter tracks failcnt for counters used by v1 and v2 controllers. However failcnt is only exported for v1 deployment and thus there is no need to maintain it in v2. The oom report does expose failcnt for memory and swap in v2 but v2 already maintains MEMCG_MAX and MEMCG_SWAP_MAX event counters which can be used. Link: https://lkml.kernel.org/r/20250228075808.207484-3-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin (Cruise) Signed-off-by: Andrew Morton --- include/linux/page_counter.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 46406f3fe34d..e4bd8fd427be 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -28,12 +28,13 @@ struct page_counter { unsigned long watermark; /* Latest cg2 reset watermark */ unsigned long local_watermark; - unsigned long failcnt; + unsigned long failcnt; /* v1-only field */ /* Keep all the read most fields in a separete cacheline. 
From 0e2759afcaf9bff25a63201856fa89b64181749f Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Thu, 27 Feb 2025 23:58:07 -0800
Subject: page_counter: track failcnt only for legacy cgroups

Currently page_counter tracks failcnt for counters used by v1 and v2 controllers. However failcnt is only exported for v1 deployments and thus there is no need to maintain it in v2. The oom report does expose failcnt for memory and swap in v2, but v2 already maintains the MEMCG_MAX and MEMCG_SWAP_MAX event counters which can be used instead.

Link: https://lkml.kernel.org/r/20250228075808.207484-3-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt
Cc: Johannes Weiner
Cc: Michal Hocko
Cc: Muchun Song
Cc: Roman Gushchin (Cruise)
Signed-off-by: Andrew Morton
---
 include/linux/page_counter.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 46406f3fe34d..e4bd8fd427be 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -28,12 +28,13 @@ struct page_counter {
 	unsigned long watermark;
 	/* Latest cg2 reset watermark */
 	unsigned long local_watermark;
-	unsigned long failcnt;
+	unsigned long failcnt; /* v1-only field */

 	/* Keep all the read most fields in a separete cacheline. */
 	CACHELINE_PADDING(_pad2_);

 	bool protection_support;
+	bool track_failcnt;
 	unsigned long min;
 	unsigned long low;
 	unsigned long high;
@@ -58,6 +59,7 @@ static inline void page_counter_init(struct page_counter *counter,
 	counter->max = PAGE_COUNTER_MAX;
 	counter->parent = parent;
 	counter->protection_support = protection_support;
+	counter->track_failcnt = false;
 }

 static inline unsigned long page_counter_read(struct page_counter *counter)
--
cgit v1.2.3

From f7b0797d36e75d2a622580b56b9bfd3130d5d0e9 Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Thu, 27 Feb 2025 23:58:08 -0800
Subject: page_counter: reduce struct page_counter size

The struct page_counter has explicit padding for better cache alignment. The commit c6f53ed8f213a ("mm, memcg: cg2 memory{.swap,}.peak write handlers") added a field to the struct page_counter and accidentally increased its size. Let's move the failcnt field, which is a v1-only field, to the same cacheline as usage to reduce the size of struct page_counter.

Link: https://lkml.kernel.org/r/20250228075808.207484-4-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt
Cc: Johannes Weiner
Cc: Michal Hocko
Cc: Muchun Song
Cc: Roman Gushchin (Cruise)
Signed-off-by: Andrew Morton
---
 include/linux/page_counter.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index e4bd8fd427be..d649b6bbbc87 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -9,10 +9,12 @@
 struct page_counter {
 	/*
-	 * Make sure 'usage' does not share cacheline with any other field. The
-	 * memcg->memory.usage is a hot member of struct mem_cgroup.
+	 * Make sure 'usage' does not share cacheline with any other field in
+	 * v2. The memcg->memory.usage is a hot member of struct mem_cgroup.
 	 */
 	atomic_long_t usage;
+	unsigned long failcnt; /* v1-only field */
+
 	CACHELINE_PADDING(_pad1_);

 	/* effective memory.min and memory.min usage tracking */
@@ -28,7 +30,6 @@ struct page_counter {
 	unsigned long watermark;
 	/* Latest cg2 reset watermark */
 	unsigned long local_watermark;
-	unsigned long failcnt; /* v1-only field */

 	/* Keep all the read most fields in a separete cacheline. */
 	CACHELINE_PADDING(_pad2_);
--
cgit v1.2.3

From f1ab2831e2a4312046bca79256b2efc41d373eaf Mon Sep 17 00:00:00 2001
From: Tang Yizhou
Date: Tue, 4 Mar 2025 19:03:16 +0800
Subject: writeback: let trace_balance_dirty_pages() take struct dtc as parameter

Patch series "Fix calculations in trace_balance_dirty_pages() for cgwb", v2.

In my experiment, I found that the output of trace_balance_dirty_pages() in the cgroup writeback scenario was strange because trace_balance_dirty_pages() always uses global_wb_domain.dirty_limit for related calculations instead of the dirty_limit of the corresponding memcg's wb_domain. The basic idea of the fix is to store the hard dirty limit value computed in wb_position_ratio() into struct dirty_throttle_control and use it for calculations in trace_balance_dirty_pages().

This patch (of 3):

Currently, trace_balance_dirty_pages() already has 12 parameters. In patch #3, I initially attempted to introduce an additional parameter. However, in include/linux/trace_events.h, bpf_trace_run12() only supports up to 12 parameters and bpf_trace_run13() does not exist. To reduce the number of parameters in trace_balance_dirty_pages(), we can make it accept a pointer to struct dirty_throttle_control as a parameter.
To achieve this, we need to move the definition of struct dirty_throttle_control from mm/page-writeback.c to include/linux/writeback.h. Link: https://lkml.kernel.org/r/20250304110318.159567-1-yizhou.tang@shopee.com Link: https://lkml.kernel.org/r/20250304110318.159567-2-yizhou.tang@shopee.com Signed-off-by: Tang Yizhou Cc: Alexei Starovoitov Cc: Christian Brauner Cc: Steven Rostedt Cc: Jan Kara Cc: "Masami Hiramatsu (Google)" Cc: Matthew Wilcow (Oracle) Cc: Tang Yizhou Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/writeback.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d11b903c2edb..32095928365c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -313,6 +313,29 @@ static inline void cgroup_writeback_umount(struct super_block *sb) /* * mm/page-writeback.c */ +/* consolidated parameters for balance_dirty_pages() and its subroutines */ +struct dirty_throttle_control { +#ifdef CONFIG_CGROUP_WRITEBACK + struct wb_domain *dom; + struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ +#endif + struct bdi_writeback *wb; + struct fprop_local_percpu *wb_completions; + + unsigned long avail; /* dirtyable */ + unsigned long dirty; /* file_dirty + write + nfs */ + unsigned long thresh; /* dirty threshold */ + unsigned long bg_thresh; /* dirty background threshold */ + + unsigned long wb_dirty; /* per-wb counterparts */ + unsigned long wb_thresh; + unsigned long wb_bg_thresh; + + unsigned long pos_ratio; + bool freerun; + bool dirty_exceeded; +}; + void laptop_io_completion(struct backing_dev_info *info); void laptop_sync_completion(void); void laptop_mode_timer_fn(struct timer_list *t); -- cgit v1.2.3 From 6cc4c3aa714bc58ec5d20f3054ca5f23534984d1 Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Tue, 4 Mar 2025 19:03:18 +0800 Subject: writeback: fix calculations in trace_balance_dirty_pages() for cgwb In the commit dcc25ae76eb7 ("writeback: move global_dirty_limit into wb_domain") of the cgroup writeback backpressure propagation patchset, Tejun made some adaptations to trace_balance_dirty_pages() for cgroup writeback. However, this adaptation was incomplete and Tejun missed further adaptation in the subsequent patches. In the cgroup writeback scenario, if sdtc in balance_dirty_pages() is assigned to mdtc, then upon entering trace_balance_dirty_pages(), __entry->limit should be assigned based on the dirty_limit of the corresponding memcg's wb_domain, rather than global_wb_domain. To address this issue and simplify the implementation, introduce a 'limit' field in struct dirty_throttle_control to store the hard_limit value computed in wb_position_ratio() by calling hard_dirty_limit(). This field will then be used in trace_balance_dirty_pages() to assign the value to __entry->limit. 
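[ Editorial sketch, not part of the patch: roughly where the new field gets filled and consumed. hard_dirty_limit() is named in the changelog; the dtc_dom() accessor and the elided arguments are assumptions for illustration. ]

	static void wb_position_ratio(struct dirty_throttle_control *dtc)
	{
		/* ... existing position ratio computation ... */

		/* remember the domain-local hard limit for later tracing */
		dtc->limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
	}

	/* later, in balance_dirty_pages(): */
	trace_balance_dirty_pages(wb, dtc /* , ... */);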
Link: https://lkml.kernel.org/r/20250304110318.159567-4-yizhou.tang@shopee.com
Fixes: dcc25ae76eb7 ("writeback: move global_dirty_limit into wb_domain")
Signed-off-by: Tang Yizhou
Acked-by: Tejun Heo
Cc: Alexei Starovoitov
Cc: Christian Brauner
Cc: Jan Kara
Cc: "Masami Hiramatsu (Google)"
Cc: Matthew Wilcow (Oracle)
Cc: Steven Rostedt
Signed-off-by: Andrew Morton
---
 include/linux/writeback.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 32095928365c..58bda3347914 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -326,6 +326,7 @@ struct dirty_throttle_control {
 	unsigned long dirty;		/* file_dirty + write + nfs */
 	unsigned long thresh;		/* dirty threshold */
 	unsigned long bg_thresh;	/* dirty background threshold */
+	unsigned long limit;		/* hard dirty limit */

 	unsigned long wb_dirty;		/* per-wb counterparts */
 	unsigned long wb_thresh;
--
cgit v1.2.3

From ab82e57981d0e4cb46d2817e7b65b9d5fdcf3832 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Tue, 4 Mar 2025 13:19:05 -0800
Subject: mm/damon/core: introduce damos->ops_filters

Patch series "mm/damon: make allow filters after reject filters useful and intuitive".

DAMOS filters do allow or reject elements of memory for a given DAMOS scheme only if they match the filter criteria. For elements that don't match any DAMOS filter, 'allowing' is the default behavior. This makes allow-filters that don't have any reject-filter after them meaningless sources of overhead. The decision was made to keep the behavior consistent with that before the introduction of allow-filters. This, however, makes usage of DAMOS filters confusing and inefficient. It is more intuitive, and still consistent behavior, to reject by default unless there is no filter at all or the last filter is a reject filter. Update the filtering logic in that way and update the documents to clarify the behavior.

Note that this is changing the old behavior. But the old behavior for the problematic filter combination was definitely confusing, inefficient and anyway useless. Also, the behavior was introduced relatively recently. It is difficult to anticipate any user that depends on the behavior. Hence this is not a user-breaking behavior change but an obvious improvement.

This patch (of 9):

DAMOS filters can be categorized into two groups depending on which layer they are handled in, namely the core layer and the ops layer. The groups are important because the filtering behavior depends on the evaluation sequence of filters, and core layer-handled filters are evaluated before operations layer-handled ones.

The behavior is clearly documented, but the implementation is a bit inefficient and complicated. All filters are maintained in a single list (damos->filters) in a mix. The filter evaluation logic in each of the core layer and the operations layer iterates over all the filters on the list, skipping filters that should not be handled by that layer. It is inefficient. Making future extensions that differentiate between filters of different handling layers will also be complicated.

Add a new list that will be used for keeping all operations layer-handled DAMOS filters in the DAMOS scheme data structure, as the traversal sketch below illustrates. Also add the support of its initialization and basic traversal functions.
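[ Editorial sketch, not part of the patch: with the dedicated list, the ops layer walks only its own filters. The function and the match helper are illustrative; the traversal macro is the one added below. ]

	static bool ops_filters_allow(struct damos *scheme, void *elem)
	{
		struct damos_filter *filter;

		damos_for_each_ops_filter(filter, scheme) {
			if (filter_matches(filter, elem))	/* stand-in helper */
				return filter->allow;
		}
		return true;	/* default behavior, revisited later in this series */
	}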
Link: https://lkml.kernel.org/r/20250304211913.53574-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20250304211913.53574-2-sj@kernel.org
Signed-off-by: SeongJae Park
Cc: Jonathan Corbet
Cc: SeongJae Park
Signed-off-by: Andrew Morton
---
 include/linux/damon.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index b3e2c793c1f4..7f76e2e99f37 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -448,6 +448,7 @@ struct damos_access_pattern {
 * @wmarks:		Watermarks for automated (in)activation of this scheme.
 * @target_nid:		Destination node if @action is "migrate_{hot,cold}".
 * @filters:		Additional set of &struct damos_filter for &action.
+ * @ops_filters:	ops layer handling &struct damos_filter objects list.
 * @last_applied:	Last @action applied ops-managing entity.
 * @stat:		Statistics of this scheme.
 * @list:		List head for siblings.
@@ -508,6 +509,7 @@ struct damos {
 		int target_nid;
 	};
 	struct list_head filters;
+	struct list_head ops_filters;
 	void *last_applied;
 	struct damos_stat stat;
 	struct list_head list;
@@ -858,6 +860,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r)
 #define damos_for_each_filter_safe(f, next, scheme) \
 	list_for_each_entry_safe(f, next, &(scheme)->filters, list)

+#define damos_for_each_ops_filter(f, scheme) \
+	list_for_each_entry(f, &(scheme)->ops_filters, list)
+
+#define damos_for_each_ops_filter_safe(f, next, scheme) \
+	list_for_each_entry_safe(f, next, &(scheme)->ops_filters, list)
+
 #ifdef CONFIG_DAMON
 struct damon_region *damon_new_region(unsigned long start, unsigned long end);
--
cgit v1.2.3

From dd038b728c8a2a0e1a632b767a50f09f076dab79 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Tue, 4 Mar 2025 13:19:10 -0800
Subject: mm/damon: add default allow/reject behavior fields to struct damos

The current default allow/reject behavior of the filters handling stage was set before the introduction of the allow behavior. For allow-filters usage, it is confusing and inefficient.

It is more intuitive to decide the default filtering stage allow/reject behavior as the opposite of the last filter's behavior. The decision should be made separately for the core and operations layers' filtering stages, since the last core layer-handled filter is not really the last filter if there are operations layer handling filters. Keeping separate decisions for the two categories can make the logic simpler. Add fields for storing the two decisions.

Link: https://lkml.kernel.org/r/20250304211913.53574-7-sj@kernel.org
Signed-off-by: SeongJae Park
Cc: Jonathan Corbet
Signed-off-by: Andrew Morton
---
 include/linux/damon.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 7f76e2e99f37..52559475dbe7 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -502,6 +502,9 @@ struct damos {
 	 * layer-handled filters.  If true, operations layer allows it, too.
 	 */
 	bool core_filters_allowed;
+	/* whether to reject core/ops filters umatched regions */
+	bool core_filters_default_reject;
+	bool ops_filters_default_reject;
 /* public: */
 	struct damos_quota quota;
 	struct damos_watermarks wmarks;
--
cgit v1.2.3

From 9bbe033c75a56d72fc35e7c8ca6f3258d9782fa5 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed
Date: Wed, 5 Mar 2025 06:11:29 +0000
Subject: mm: zpool: add interfaces for object read/write APIs

Patch series "Switch zswap to object read/write APIs".
This patch series updates zswap to use the new object read/write APIs defined by zsmalloc in [1], and remove the old object mapping APIs and the related code from zpool and zsmalloc. This patch (of 5): Zsmalloc introduced new APIs to read/write objects besides mapping them. Add the necessary zpool interfaces. Link: https://lkml.kernel.org/r/20250305061134.4105762-1-yosry.ahmed@linux.dev Link: https://lkml.kernel.org/r/20250305061134.4105762-2-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Reviewed-by: Sergey Senozhatsky Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Chengming Zhou Cc: Herbert Xu Cc: Minchan Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/zpool.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 5e6dc46b8cc4..1784e735ee04 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -52,6 +52,16 @@ void *zpool_map_handle(struct zpool *pool, unsigned long handle, void zpool_unmap_handle(struct zpool *pool, unsigned long handle); + +void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, + void *local_copy); + +void zpool_obj_read_end(struct zpool *zpool, unsigned long handle, + void *handle_mem); + +void zpool_obj_write(struct zpool *zpool, unsigned long handle, + void *handle_mem, size_t mem_len); + u64 zpool_get_total_pages(struct zpool *pool); @@ -90,6 +100,13 @@ struct zpool_driver { enum zpool_mapmode mm); void (*unmap)(void *pool, unsigned long handle); + void *(*obj_read_begin)(void *pool, unsigned long handle, + void *local_copy); + void (*obj_read_end)(void *pool, unsigned long handle, + void *handle_mem); + void (*obj_write)(void *pool, unsigned long handle, + void *handle_mem, size_t mem_len); + u64 (*total_pages)(void *pool); }; -- cgit v1.2.3 From fcbea574754c63f7035d0c4ef7dfb161b60b5bde Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 5 Mar 2025 06:11:31 +0000 Subject: mm: zpool: remove object mapping APIs zpool_map_handle(), zpool_unmap_handle(), and zpool_can_sleep_mapped() are no longer used. Remove them with the underlying driver callbacks. Link: https://lkml.kernel.org/r/20250305061134.4105762-4-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Reviewed-by: Sergey Senozhatsky Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Chengming Zhou Cc: Herbert Xu Cc: Minchan Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/zpool.h | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 1784e735ee04..2c8a9d2654f6 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -13,25 +13,6 @@ struct zpool; -/* - * Control how a handle is mapped. It will be ignored if the - * implementation does not support it. Its use is optional. - * Note that this does not refer to memory protection, it - * refers to how the memory will be copied in/out if copying - * is necessary during mapping; read-write is the safest as - * it copies the existing memory in on map, and copies the - * changed memory back out on unmap. Write-only does not copy - * in the memory and should only be used for initialization. - * If in doubt, use ZPOOL_MM_DEFAULT which is read-write. 
- */ -enum zpool_mapmode { - ZPOOL_MM_RW, /* normal read-write mapping */ - ZPOOL_MM_RO, /* read-only (no copy-out at unmap time) */ - ZPOOL_MM_WO, /* write-only (no copy-in at map time) */ - - ZPOOL_MM_DEFAULT = ZPOOL_MM_RW -}; - bool zpool_has_pool(char *type); struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp); @@ -47,12 +28,6 @@ int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, void zpool_free(struct zpool *pool, unsigned long handle); -void *zpool_map_handle(struct zpool *pool, unsigned long handle, - enum zpool_mapmode mm); - -void zpool_unmap_handle(struct zpool *pool, unsigned long handle); - - void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, void *local_copy); @@ -95,11 +70,6 @@ struct zpool_driver { unsigned long *handle); void (*free)(void *pool, unsigned long handle); - bool sleep_mapped; - void *(*map)(void *pool, unsigned long handle, - enum zpool_mapmode mm); - void (*unmap)(void *pool, unsigned long handle); - void *(*obj_read_begin)(void *pool, unsigned long handle, void *local_copy); void (*obj_read_end)(void *pool, unsigned long handle, -- cgit v1.2.3 From 07864f1a57fb1f798a7d21f13e4929c9cb52daf7 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 5 Mar 2025 06:11:32 +0000 Subject: mm: zsmalloc: remove object mapping APIs and per-CPU map areas zs_map_object() and zs_unmap_object() are no longer used, remove them. Since these are the only users of per-CPU mapping_areas, remove them and the associated CPU hotplug callbacks too. [yosry.ahmed@linux.dev: update the docs] Link: https://lkml.kernel.org/r/Z8ier-ZZp8T6MOTH@google.com Link: https://lkml.kernel.org/r/20250305061134.4105762-5-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Acked-by: Sergey Senozhatsky Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Chengming Zhou Cc: Herbert Xu Cc: Minchan Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/cpuhotplug.h | 1 - include/linux/zsmalloc.h | 21 --------------------- 2 files changed, 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 6cc5e484547c..1987400000b4 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -116,7 +116,6 @@ enum cpuhp_state { CPUHP_NET_IUCV_PREPARE, CPUHP_ARM_BL_PREPARE, CPUHP_TRACE_RB_PREPARE, - CPUHP_MM_ZS_PREPARE, CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 7d70983cf398..c26baf9fb331 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -16,23 +16,6 @@ #include -/* - * zsmalloc mapping modes - * - * NOTE: These only make a difference when a mapped object spans pages. - */ -enum zs_mapmode { - ZS_MM_RW, /* normal read-write mapping */ - ZS_MM_RO, /* read-only (no copy-out at unmap time) */ - ZS_MM_WO /* write-only (no copy-in at map time) */ - /* - * NOTE: ZS_MM_WO should only be used for initializing new - * (uninitialized) allocations. Partial writes to already - * initialized allocations should use ZS_MM_RW to preserve the - * existing data. 
- */
-};
-
 struct zs_pool_stats {
 	/* How many pages were migrated (freed) */
 	atomic_long_t pages_compacted;
@@ -48,10 +31,6 @@ void zs_free(struct zs_pool *pool, unsigned long obj);

 size_t zs_huge_class_size(struct zs_pool *pool);

-void *zs_map_object(struct zs_pool *pool, unsigned long handle,
-		    enum zs_mapmode mm);
-void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
-
 unsigned long zs_get_total_pages(struct zs_pool *pool);
 unsigned long zs_compact(struct zs_pool *pool);
--
cgit v1.2.3

From 7b60041156079f256a7df3f7de469de7db618580 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed
Date: Wed, 5 Mar 2025 06:11:33 +0000
Subject: mm: zpool: remove zpool_malloc_support_movable()

zpool_malloc_support_movable() always returns true for zsmalloc, the only remaining zpool driver. Remove it and set the gfp flags in zswap_compress() accordingly. Opportunistically use GFP_NOWAIT instead of __GFP_NOWARN | __GFP_KSWAPD_RECLAIM for conciseness as they are equivalent.

Link: https://lkml.kernel.org/r/20250305061134.4105762-6-yosry.ahmed@linux.dev
Signed-off-by: Yosry Ahmed
Reviewed-by: Sergey Senozhatsky
Acked-by: Johannes Weiner
Acked-by: Nhat Pham
Cc: Thomas Gleixner
Cc: Chengming Zhou
Cc: Herbert Xu
Cc: Minchan Kim
Cc: Peter Zijlstra
Signed-off-by: Andrew Morton
---
 include/linux/zpool.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index 2c8a9d2654f6..52f30e526607 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -21,8 +21,6 @@ const char *zpool_get_type(struct zpool *pool);

 void zpool_destroy_pool(struct zpool *pool);

-bool zpool_malloc_support_movable(struct zpool *pool);
-
 int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp,
 		unsigned long *handle);

@@ -65,7 +63,6 @@ struct zpool_driver {
 	void *(*create)(const char *name, gfp_t gfp);
 	void (*destroy)(void *pool);

-	bool malloc_support_movable;
 	int (*malloc)(void *pool, size_t size, gfp_t gfp,
 				unsigned long *handle);
 	void (*free)(void *pool, unsigned long handle);
--
cgit v1.2.3

From c53e14f1ea4a8f8ddd9b2cd850fcbc0d934b79f5 Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Mon, 10 Mar 2025 11:15:36 -0700
Subject: perf: Extend per event callchain limit to branch stack

The commit 97c79a38cd45 ("perf core: Per event callchain limit") introduced a per-event term to allow finer tuning of the depth of callchains to save space. It should be applied to the branch stack as well. For example, autoFDO collections require the maximum LBR entries. In the meantime, other system-wide LBR users may only be interested in the latest few LBRs. A per-event LBR depth would save space in the perf output buffer.

The patch simply drops the uninteresting branches, but HW still collects the maximum branches. There may be a model-specific optimization that can reduce the HW depth for some cases to reduce the overhead further, but it isn't included in this patch set because it isn't useful for all cases. For example, ARCH LBR can utilize the PEBS and XSAVE to collect LBRs. The depth should have less impact on the collecting overhead. The model-specific optimization may be implemented later separately.
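[ Editorial sketch, not part of the patch: how a userspace profiler might bound the per-event LBR depth through the existing sample_max_stack attribute field, which this patch extends to branch stacks. The values are illustrative. ]

	struct perf_event_attr attr = {
		.size = sizeof(attr),
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES,
		.sample_type = PERF_SAMPLE_BRANCH_STACK,
		.branch_sample_type = PERF_SAMPLE_BRANCH_ANY,
		.sample_max_stack = 8,	/* keep only the latest 8 branch records */
	};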
Signed-off-by: Kan Liang
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20250310181536.3645382-1-kan.liang@linux.intel.com
---
 include/linux/perf_event.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 76f4265efee9..3e270822b915 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1347,6 +1347,9 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data,
 	if (branch_sample_hw_index(event))
 		size += sizeof(u64);
+
+	brs->nr = min_t(u16, event->attr.sample_max_stack, brs->nr);
+
 	size += brs->nr * sizeof(struct perf_branch_entry);

 	/*
--
cgit v1.2.3

From cb4369129339060218baca718a578bb0b826e734 Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Fri, 14 Mar 2025 10:26:54 -0700
Subject: perf: Save PMU specific data in task_struct

Some PMU specific data has to be saved/restored during context switch, e.g. LBR call stack data. Currently, the data is saved in the event context structure, but only for per-process events. For system-wide events, because the LBR call stack data is missing after a context switch, LBR call stacks are always shorter in comparison to per-process mode. For example:

Per-process mode:
$perf record --call-graph lbr -- taskset -c 0 ./tchain_edit

-   99.90%    99.86%  tchain_edit  tchain_edit  [.] f3
     99.86% _start
        __libc_start_main
        generic_start_main
        main
        f1
      - f2
           f3

System-wide mode:
$perf record --call-graph lbr -a -- taskset -c 0 ./tchain_edit

-   99.88%    99.82%  tchain_edit  tchain_edit  [.] f3
   - 62.02% main
        f1
        f2
        f3
   - 28.83% f1
      - f2
           f3
   - 8.88% generic_start_main
        main
        f1
        f2
        f3

It isn't practical to simply allocate the data for system-wide events in the CPU context structure for all tasks. We have no idea which CPU a task will be scheduled to. The duplicated LBR data would have to be maintained on every CPU context structure. That's a huge waste. Otherwise, the LBR data is still lost if the task is scheduled to another CPU.

Save the pmu specific data in task_struct. The size of the pmu specific data is 788 bytes for the LBR call stack. Usually, the overall number of threads doesn't exceed a few thousand. For 10K threads, keeping LBR data would consume an additional ~8MB. The additional space will only be allocated during LBR call stack monitoring. It will be released when the monitoring is finished.

Furthermore, moving task_ctx_data from perf_event_context to task_struct can reduce complexity and make things clearer. E.g. perf doesn't need to swap task_ctx_data on the optimized context switch path. This patch set is just the first step. There could be other optimizations/extensions on top of this patch set. E.g. for cgroup profiling, perf just needs to save/store the LBR call stack information for tasks in a specific cgroup. That could reduce the additional space. Also, the LBR call stack could be made available for software events, or even allow debugging use cases, like LBRs on crash later.

Because of the alignment requirement of Intel Arch LBR, the Kmem cache is used to allocate the PMU specific data. It's required when a child task allocates the space. Save it in struct perf_ctx_data. The refcount in struct perf_ctx_data is used to track the users of the pmu specific data.
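[ Editorial sketch, not part of the patch: the access pattern implied by the kernel-doc below; the data is only touched on context switch, under rcu_read_lock(). The save helper is a stand-in for the PMU-specific callback. ]

	struct perf_ctx_data *cd;

	rcu_read_lock();
	cd = rcu_dereference(task->perf_ctx_data);
	if (cd)
		save_lbr_stack(cd->data);	/* stand-in for the PMU callback */
	rcu_read_unlock();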
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alexey Budankov Link: https://lore.kernel.org/r/20250314172700.438923-1-kan.liang@linux.intel.com --- include/linux/perf_event.h | 35 +++++++++++++++++++++++++++++++++++ include/linux/sched.h | 2 ++ 2 files changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3e270822b915..75d9b1e93f39 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1021,6 +1021,41 @@ struct perf_event_context { local_t nr_no_switch_fast; }; +/** + * struct perf_ctx_data - PMU specific data for a task + * @rcu_head: To avoid the race on free PMU specific data + * @refcount: To track users + * @global: To track system-wide users + * @ctx_cache: Kmem cache of PMU specific data + * @data: PMU specific data + * + * Currently, the struct is only used in Intel LBR call stack mode to + * save/restore the call stack of a task on context switches. + * + * The rcu_head is used to prevent the race on free the data. + * The data only be allocated when Intel LBR call stack mode is enabled. + * The data will be freed when the mode is disabled. + * The content of the data will only be accessed in context switch, which + * should be protected by rcu_read_lock(). + * + * Because of the alignment requirement of Intel Arch LBR, the Kmem cache + * is used to allocate the PMU specific data. The ctx_cache is to track + * the Kmem cache. + * + * Careful: Struct perf_ctx_data is added as a pointer in struct task_struct. + * When system-wide Intel LBR call stack mode is enabled, a buffer with + * constant size will be allocated for each task. + * Also, system memory consumption can further grow when the size of + * struct perf_ctx_data enlarges. + */ +struct perf_ctx_data { + struct rcu_head rcu_head; + refcount_t refcount; + int global; + struct kmem_cache *ctx_cache; + void *data; +}; + struct perf_cpu_pmu_context { struct perf_event_pmu_context epc; struct perf_event_pmu_context *task_epc; diff --git a/include/linux/sched.h b/include/linux/sched.h index 9632e3318e0d..7e183eeb50ec 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -65,6 +65,7 @@ struct mempolicy; struct nameidata; struct nsproxy; struct perf_event_context; +struct perf_ctx_data; struct pid_namespace; struct pipe_inode_info; struct rcu_node; @@ -1311,6 +1312,7 @@ struct task_struct { struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; + struct perf_ctx_data __rcu *perf_ctx_data; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; -- cgit v1.2.3 From fdfda868ee3b5da1fbdb7710b731e09d8dd3a615 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Fri, 14 Mar 2025 10:26:55 -0700 Subject: locking/percpu-rwsem: Add guard support To simplify the usage of the percpu rw semaphore. 
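[ Editorial sketch, not part of the patch: what the new guards buy. With guard() from cleanup.h and the classes defined below, percpu_up_read()/percpu_up_write() run automatically on every return path. The helper names are illustrative. ]

	static int walk_under_sem(struct percpu_rw_semaphore *sem)
	{
		guard(percpu_read)(sem);	/* percpu_down_read(sem) here */

		if (!some_precondition())	/* stand-in check */
			return -EINVAL;		/* up_read happens automatically */

		return do_work();		/* stand-in work */
	}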
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Kan Liang
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20250314172700.438923-2-kan.liang@linux.intel.com
---
 include/linux/percpu-rwsem.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index c012df33a9f0..af7d75ede619 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include

 struct percpu_rw_semaphore {
 	struct rcu_sync rss;
@@ -125,6 +126,13 @@ extern bool percpu_is_read_locked(struct percpu_rw_semaphore *);
 extern void percpu_down_write(struct percpu_rw_semaphore *);
 extern void percpu_up_write(struct percpu_rw_semaphore *);

+DEFINE_GUARD(percpu_read, struct percpu_rw_semaphore *,
+	     percpu_down_read(_T), percpu_up_read(_T))
+DEFINE_GUARD_COND(percpu_read, _try, percpu_down_read_trylock(_T))
+
+DEFINE_GUARD(percpu_write, struct percpu_rw_semaphore *,
+	     percpu_down_write(_T), percpu_up_write(_T))
+
 static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem)
 {
 	return atomic_read(&sem->block);
--
cgit v1.2.3

From 506e64e710ff9573fd2b86686528762b7901b5e4 Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Fri, 14 Mar 2025 10:26:56 -0700
Subject: perf: attach/detach PMU specific data

The LBR call stack data has to be saved/restored during context switch to fix the shorter LBR call stacks issue in system-wide mode. Allocate PMU specific data and attach it to the corresponding task_struct during LBR call stack monitoring.

When an LBR call stack event is accounted, the perf_ctx_data for the related tasks will be allocated/attached by attach_perf_ctx_data(). When an LBR call stack event is unaccounted, the perf_ctx_data for the related tasks will be detached/freed by detach_perf_ctx_data().

The LBR call stack event could be a per-task event or a system-wide event.

- For a per-task event, perf only allocates the perf_ctx_data for the current task. If the allocation fails, perf will error out.

- For a system-wide event, perf has to allocate the perf_ctx_data for both the existing tasks and the upcoming tasks. The allocation for the existing tasks is done in perf_event_alloc(). If any allocation fails, perf will error out. The allocation for the new tasks will be done in perf_event_fork(). A global reader/writer semaphore, global_ctx_data_rwsem, is added to address the global race.

- The perf_ctx_data is only freed by the last LBR call stack event. The number of per-task events is tracked by the refcount of each task. Since system-wide events impact all tasks, it's not practical to go through the whole task list to update the refcount for each system-wide event. The number of system-wide events is tracked by a global variable, global_ctx_data_ref.
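[ Editorial sketch, not the in-tree code: the accounting scheme described above, with the first system-wide user triggering allocation for all existing tasks. The names follow the changelog; the task walk and error handling are elided. ]

	static int attach_global_ctx_data(struct kmem_cache *ctx_cache)
	{
		percpu_down_write(&global_ctx_data_rwsem);
		if (!global_ctx_data_ref++) {
			/* first system-wide user: attach to every existing task */
			/* ... walk tasks, allocate and attach perf_ctx_data ... */
		}
		percpu_up_write(&global_ctx_data_rwsem);
		return 0;
	}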
Suggested-by: "Peter Zijlstra (Intel)" Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250314172700.438923-3-kan.liang@linux.intel.com --- include/linux/perf_event.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 75d9b1e93f39..2551170c0d18 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -676,11 +676,12 @@ struct swevent_hlist { #define PERF_ATTACH_GROUP 0x0002 #define PERF_ATTACH_TASK 0x0004 #define PERF_ATTACH_TASK_DATA 0x0008 -#define PERF_ATTACH_ITRACE 0x0010 +#define PERF_ATTACH_GLOBAL_DATA 0x0010 #define PERF_ATTACH_SCHED_CB 0x0020 #define PERF_ATTACH_CHILD 0x0040 #define PERF_ATTACH_EXCLUSIVE 0x0080 #define PERF_ATTACH_CALLCHAIN 0x0100 +#define PERF_ATTACH_ITRACE 0x0200 struct bpf_prog; struct perf_cgroup; -- cgit v1.2.3 From d57e94f5b891925e4f2796266eba31edd5a01903 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 14 Mar 2025 10:26:57 -0700 Subject: perf: Supply task information to sched_task() To save/restore LBR call stack data in system-wide mode, the task_struct information is required. Extend the parameters of sched_task() to supply task_struct information. When schedule in, the LBR call stack data for new task will be restored. When schedule out, the LBR call stack data for old task will be saved. Only need to pass the required task_struct information. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250314172700.438923-4-kan.liang@linux.intel.com --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2551170c0d18..58f40c89d728 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -494,7 +494,7 @@ struct pmu { * context-switches callback */ void (*sched_task) (struct perf_event_pmu_context *pmu_ctx, - bool sched_in); + struct task_struct *task, bool sched_in); /* * Kmem cache of PMU specific data -- cgit v1.2.3 From 8bdc5daaa01e3054647d394d354762210ad88f17 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 14 Mar 2025 17:08:02 +0100 Subject: sched: Add a generic function to return the preemption string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The individual architectures often add the preemption model to the begin of the backtrace. This is the case on X86 or ARM64 for the "die" case but not for regular warning. With the addition of DYNAMIC_PREEMPT for PREEMPT_RT we end up with CONFIG_PREEMPT and CONFIG_PREEMPT_RT set simultaneously. That means that everyone who tried to add that piece of information gets it wrong for PREEMPT_RT because PREEMPT is checked first. Provide a generic function which returns the current scheduling model considering LAZY preempt and the current state of PREEMPT_DYNAMIC. 
The resulting strings are:

┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
┃ Model     ┃ -RT -DYN     ┃ +RT -DYN          ┃ -RT +DYN           ┃ +RT +DYN          ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
│NONE       │ NONE         │ n/a               │ PREEMPT(none)      │ n/a               │
├───────────┼──────────────┼───────────────────┼────────────────────┼───────────────────┤
│VOLUNTARY  │ VOLUNTARY    │ n/a               │ PREEMPT(voluntary) │ n/a               │
├───────────┼──────────────┼───────────────────┼────────────────────┼───────────────────┤
│FULL       │ PREEMPT      │ PREEMPT_RT        │ PREEMPT(full)      │ PREEMPT_{RT,full} │
├───────────┼──────────────┼───────────────────┼────────────────────┼───────────────────┤
│LAZY       │ PREEMPT_LAZY │ PREEMPT_{RT,LAZY} │ PREEMPT(lazy)      │ PREEMPT_{RT,lazy} │
└───────────┴──────────────┴───────────────────┴────────────────────┴───────────────────┘

[ The dynamic building of the string can lead to an empty string if the function is invoked simultaneously on two CPUs. ]

Co-developed-by: "Peter Zijlstra (Intel)"
Signed-off-by: "Peter Zijlstra (Intel)"
Co-developed-by: "Steven Rostedt (Google)"
Signed-off-by: "Steven Rostedt (Google)"
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Shrikanth Hegde
Link: https://lore.kernel.org/r/20250314160810.2373416-2-bigeasy@linutronix.de
---
 include/linux/preempt.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index ca86235ac15c..3e9808f2b549 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -515,6 +515,8 @@ static inline bool preempt_model_rt(void)
 	return IS_ENABLED(CONFIG_PREEMPT_RT);
 }

+extern const char *preempt_model_str(void);
+
 /*
  * Does the preemption model allow non-cooperative preemption?
  *
--
cgit v1.2.3

From bd2da08d9363d191551d79e5b04121348e18af5a Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Fri, 14 Mar 2025 10:27:00 -0700
Subject: perf: Clean up pmu specific data

The pmu specific data is saved in task_struct now. Remove it from the event context structure. Remove swap_task_ctx() as well.

Signed-off-by: Kan Liang
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20250314172700.438923-7-kan.liang@linux.intel.com
---
 include/linux/perf_event.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 58f40c89d728..5b8e3aabac02 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -501,16 +501,6 @@ struct pmu {
 	 */
 	struct kmem_cache *task_ctx_cache;
-	/*
-	 * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data)
-	 * can be synchronized using this function. See Intel LBR callstack support
-	 * implementation and Perf core context switch handling callbacks for usage
-	 * examples.
-	 */
-	void (*swap_task_ctx)		(struct perf_event_pmu_context *prev_epc,
-					 struct perf_event_pmu_context *next_epc);
-					/* optional */
-
 	/*
 	 * Set up pmu-private data structures for an AUX area
 	 */
@@ -933,7 +923,6 @@ struct perf_event_pmu_context {
 	atomic_t			refcount; /* event <-> epc */
 	struct rcu_head			rcu_head;

-	void				*task_ctx_data; /* pmu specific data */
 	/*
 	 * Set when one or more (plausibly active) event can't be scheduled
 	 * due to pmu overcommit or pmu constraints, except tolerant to
@@ -981,7 +970,6 @@ struct perf_event_context {
 	int				nr_user;
 	int				is_active;

-	int				nr_task_data;
 	int				nr_stat;
 	int				nr_freq;
 	int				rotate_disable;
--
cgit v1.2.3

From 56209334dda1832c0a919e1d74768c6d0f3b2ca9 Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Thu, 13 Mar 2025 18:03:32 +0100
Subject: sched/topology: Wrappers for sched_domains_mutex

Create wrappers for sched_domains_mutex so that it can transparently be used on both CONFIG_SMP and !CONFIG_SMP, as some functions will need to do.

Fixes: 53916d5fd3c0 ("sched/deadline: Check bandwidth overflow earlier for hotplug")
Reported-by: Jon Hunter
Signed-off-by: Juri Lelli
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Valentin Schneider
Reviewed-by: Dietmar Eggemann
Tested-by: Waiman Long
Tested-by: Jon Hunter
Tested-by: Dietmar Eggemann
Link: https://lore.kernel.org/r/Z9MP5Oq9RB8jBs3y@jlelli-thinkpadt14gen4.remote.csb
---
 include/linux/sched.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9632e3318e0d..0785268c76f8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -382,6 +382,11 @@ enum uclamp_id {
 #ifdef CONFIG_SMP
 extern struct root_domain def_root_domain;
 extern struct mutex sched_domains_mutex;
+extern void sched_domains_mutex_lock(void);
+extern void sched_domains_mutex_unlock(void);
+#else
+static inline void sched_domains_mutex_lock(void) { }
+static inline void sched_domains_mutex_unlock(void) { }
 #endif

 struct sched_param {
--
cgit v1.2.3

From 45007c6fb5860cf63556a9cadc87c8984927e23d Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Thu, 13 Mar 2025 18:05:46 +0100
Subject: sched/deadline: Generalize unique visiting of root domains

Bandwidth checks and updates that work on root domains currently employ a cookie mechanism for efficiency. This mechanism is very much tied to when root domains are first created and initialized.

Generalize the cookie mechanism so that it can also be used later, at runtime, while updating root domains. Also, additionally guard it with sched_domains_mutex, since domains need to be stable while updating them (and it will be required for further dynamic changes).
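[ Editorial sketch, not the in-tree code: the visiting pattern behind the two declarations added below. The rd->visit_cookie field name and the cpu_rq() lookup are assumptions for illustration. ]

	u64 dl_cookie;

	bool dl_bw_visited(int cpu, u64 cookie)
	{
		struct root_domain *rd = cpu_rq(cpu)->rd;

		if (rd->visit_cookie == cookie)
			return true;	/* this root domain was already handled */

		rd->visit_cookie = cookie;
		return false;
	}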
Fixes: 53916d5fd3c0 ("sched/deadline: Check bandwidth overflow earlier for hotplug") Reported-by: Jon Hunter Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Dietmar Eggemann Tested-by: Waiman Long Tested-by: Jon Hunter Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/Z9MQaiXPvEeW_v7x@jlelli-thinkpadt14gen4.remote.csb --- include/linux/sched/deadline.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 3a912ab42bb5..6ec578600b24 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -37,4 +37,7 @@ extern void dl_clear_root_domain(struct root_domain *rd); #endif /* CONFIG_SMP */ +extern u64 dl_cookie; +extern bool dl_bw_visited(int cpu, u64 cookie); + #endif /* _LINUX_SCHED_DEADLINE_H */ -- cgit v1.2.3 From 2ff899e3516437354204423ef0a94994717b8e6a Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 13 Mar 2025 18:10:21 +0100 Subject: sched/deadline: Rebuild root domain accounting after every update Rebuilding of root domains accounting information (total_bw) is currently broken on some cases, e.g. suspend/resume on aarch64. Problem is that the way we keep track of domain changes and try to add bandwidth back is convoluted and fragile. Fix it by simplify things by making sure bandwidth accounting is cleared and completely restored after root domains changes (after root domains are again stable). To be sure we always call dl_rebuild_rd_accounting while holding cpuset_mutex we also add cpuset_reset_sched_domains() wrapper. Fixes: 53916d5fd3c0 ("sched/deadline: Check bandwidth overflow earlier for hotplug") Reported-by: Jon Hunter Co-developed-by: Waiman Long Signed-off-by: Waiman Long Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/Z9MRfeJKJUOyUSto@jlelli-thinkpadt14gen4.remote.csb --- include/linux/cpuset.h | 6 ++++++ include/linux/sched/deadline.h | 1 + include/linux/sched/topology.h | 2 ++ 3 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 835e7b793f6a..17cc90d900f9 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -128,6 +128,7 @@ extern bool current_cpuset_is_being_rebound(void); extern void rebuild_sched_domains(void); extern void cpuset_print_current_mems_allowed(void); +extern void cpuset_reset_sched_domains(void); /* * read_mems_allowed_begin is required when making decisions involving @@ -264,6 +265,11 @@ static inline void rebuild_sched_domains(void) partition_sched_domains(1, NULL, NULL); } +static inline void cpuset_reset_sched_domains(void) +{ + partition_sched_domains(1, NULL, NULL); +} + static inline void cpuset_print_current_mems_allowed(void) { } diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 6ec578600b24..f9aabbc9d22e 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -34,6 +34,7 @@ static inline bool dl_time_before(u64 a, u64 b) struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); +extern void dl_clear_root_domain_cpu(int cpu); #endif /* CONFIG_SMP */ diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7f3dbafe1817..1622232bd08b 100644 --- a/include/linux/sched/topology.h +++ 
b/include/linux/sched/topology.h
@@ -166,6 +168,8 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 	return to_cpumask(sd->span);
 }

+extern void dl_rebuild_rd_accounting(void);
+
 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 				    struct sched_domain_attr *dattr_new);
--
cgit v1.2.3

From d128130f486b4aa86086655af0fbb943b26b0003 Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Thu, 13 Mar 2025 18:12:43 +0100
Subject: sched/topology: Stop exposing partition_sched_domains_locked

There are no callers of partition_sched_domains_locked() outside topology.c. Stop exposing the function.

Suggested-by: Waiman Long
Signed-off-by: Juri Lelli
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Valentin Schneider
Reviewed-by: Dietmar Eggemann
Tested-by: Waiman Long
Tested-by: Jon Hunter
Tested-by: Dietmar Eggemann
Link: https://lore.kernel.org/r/Z9MSC96a8FcqWV3G@jlelli-thinkpadt14gen4.remote.csb
---
 include/linux/sched/topology.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 1622232bd08b..96e69bfc3c8a 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -168,10 +168,6 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)

 extern void dl_rebuild_rd_accounting(void);

-extern void partition_sched_domains_locked(int ndoms_new,
-					   cpumask_var_t doms_new[],
-					   struct sched_domain_attr *dattr_new);
-
 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 				    struct sched_domain_attr *dattr_new);

@@ -212,12 +208,6 @@ extern void __init set_sched_topology(struct sched_domain_topology_level *tl);

 struct sched_domain_attr;

-static inline void
-partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
-			       struct sched_domain_attr *dattr_new)
-{
-}
-
 static inline void
 partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 			struct sched_domain_attr *dattr_new)
--
cgit v1.2.3

From 34929a070b7fd06c386080c926b61ee844e6ad34 Mon Sep 17 00:00:00 2001
From: Juri Lelli
Date: Thu, 13 Mar 2025 18:13:29 +0100
Subject: include/{topology,cpuset}: Move dl_rebuild_rd_accounting to cpuset.h

dl_rebuild_rd_accounting() is defined in cpuset.c, so it makes more sense to move the related declarations to cpuset.h. Implement the move.
Suggested-by: Waiman Long
Signed-off-by: Juri Lelli
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Waiman Long
Reviewed-by: Valentin Schneider
Reviewed-by: Dietmar Eggemann
Tested-by: Waiman Long
Tested-by: Jon Hunter
Tested-by: Dietmar Eggemann
Link: https://lore.kernel.org/r/Z9MSOVMpU7jpVrMU@jlelli-thinkpadt14gen4.remote.csb
---
 include/linux/cpuset.h         | 5 +++++
 include/linux/sched/topology.h | 2 --
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 17cc90d900f9..5466c96a33db 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -125,6 +125,7 @@ static inline int cpuset_do_page_mem_spread(void)

 extern bool current_cpuset_is_being_rebound(void);

+extern void dl_rebuild_rd_accounting(void);
 extern void rebuild_sched_domains(void);

 extern void cpuset_print_current_mems_allowed(void);
@@ -260,6 +261,10 @@ static inline bool current_cpuset_is_being_rebound(void)
 	return false;
 }

+static inline void dl_rebuild_rd_accounting(void)
+{
+}
+
 static inline void rebuild_sched_domains(void)
 {
 	partition_sched_domains(1, NULL, NULL);
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 96e69bfc3c8a..51f7b8169515 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -166,8 +166,6 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 	return to_cpumask(sd->span);
 }

-extern void dl_rebuild_rd_accounting(void);
-
 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 				    struct sched_domain_attr *dattr_new);
--
cgit v1.2.3

From 12e766d16814808b6a581597cef6ce9fc029e917 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 17 Mar 2025 11:39:35 +0100
Subject: perf: Fix __percpu annotation

With bcecd5a529c1 ("percpu: repurpose __percpu tag as a named address space qualifier") the normal compilers start caring about the __percpu annotation, as such f67d1ffd841f ("perf/core: Detach 'struct perf_cpu_pmu_context' and 'struct pmu' lifetimes") needs a fixup.

Fixes: f67d1ffd841f ("perf/core: Detach 'struct perf_cpu_pmu_context' and 'struct pmu' lifetimes")
Fixes: bcecd5a529c1 ("percpu: repurpose __percpu tag as a named address space qualifier")
Reported-by: Stephen Rothwell
Reported-by: jirislaby@kernel.org
Signed-off-by: Peter Zijlstra (Intel)
---
 include/linux/perf_event.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5b8e3aabac02..63dddb3b54f0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -343,7 +343,7 @@ struct pmu {
 	 */
 	unsigned int			scope;

-	struct perf_cpu_pmu_context __percpu **cpu_pmu_context;
+	struct perf_cpu_pmu_context * __percpu *cpu_pmu_context;
 	atomic_t			exclusive_cnt; /* < 0: cpu; > 0: tsk */
 	int				task_ctx_nr;
 	int				hrtimer_interval_ms;
--
cgit v1.2.3

From bba38b874886b227283d56ecb2f7ebbaa01a781d Mon Sep 17 00:00:00 2001
From: Johan Hovold
Date: Wed, 19 Feb 2025 14:41:14 +0100
Subject: rtc: pm8xxx: add support for uefi offset

On many Qualcomm platforms the PMIC RTC control and time registers are read-only so that the RTC time cannot be updated. Instead, an offset needs to be stored in some machine-specific non-volatile memory, which the driver can take into account.

Add support for storing a 32-bit offset from the GPS time epoch in a UEFI variable so that the RTC time can be set on such platforms.
The UEFI variable is 882f8c2b-9646-435f-8de5-f208ff80c1bd-RTCInfo and holds a 12-byte structure where the first four bytes is a GPS time offset in little-endian byte order. Note that this format is not arbitrary as the variable is shared with the UEFI firmware (and Windows). Tested-by: Jens Glathe Tested-by: Steev Klimaszewski Tested-by: Joel Stanley Tested-by: Sebastian Reichel # Lenovo T14s Gen6 Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20250219134118.31017-3-johan+linaro@kernel.org Signed-off-by: Alexandre Belloni --- include/linux/rtc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 3f4d315aaec9..95da051fb155 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -170,6 +170,7 @@ struct rtc_device { /* useful timestamps */ #define RTC_TIMESTAMP_BEGIN_0000 -62167219200ULL /* 0000-01-01 00:00:00 */ #define RTC_TIMESTAMP_BEGIN_1900 -2208988800LL /* 1900-01-01 00:00:00 */ +#define RTC_TIMESTAMP_EPOCH_GPS 315964800LL /* 1980-01-06 00:00:00 */ #define RTC_TIMESTAMP_BEGIN_2000 946684800LL /* 2000-01-01 00:00:00 */ #define RTC_TIMESTAMP_END_2063 2966371199LL /* 2063-12-31 23:59:59 */ #define RTC_TIMESTAMP_END_2079 3471292799LL /* 2079-12-31 23:59:59 */ -- cgit v1.2.3 From 023af5a72ab161f2e661afb53e3b6a6901f6ba00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 5 Mar 2025 23:38:48 +0100 Subject: gso: AccECN support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Handling the CWR flag differs between RFC 3168 ECN and AccECN. With RFC 3168 ECN aware TSO (NETIF_F_TSO_ECN) CWR flag is cleared starting from 2nd segment which is incompatible how AccECN handles the CWR flag. Such super-segments are indicated by SKB_GSO_TCP_ECN. With AccECN, CWR flag (or more accurately, the ACE field that also includes ECE & AE flags) changes only when new packet(s) with CE mark arrives so the flag should not be changed within a super-skb. The new skb/feature flags are necessary to prevent such TSO engines corrupting AccECN ACE counters by clearing the CWR flag (if the CWR handling feature cannot be turned off). If NIC is completely unaware of RFC3168 ECN (doesn't support NETIF_F_TSO_ECN) or its TSO engine can be set to not touch CWR flag despite supporting also NETIF_F_TSO_ECN, TSO could be safely used with AccECN on such NIC. This should be evaluated per NIC basis (not done in this patch series for any NICs). For the cases, where TSO cannot keep its hands off the CWR flag, a GSO fallback is provided by this patch. Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 8 +++++--- include/linux/netdevice.h | 2 ++ include/linux/skbuff.h | 2 ++ 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 11be70a7929f..7a01c518e573 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -53,12 +53,12 @@ enum { NETIF_F_GSO_UDP_BIT, /* ... UFO, deprecated except tuntap */ NETIF_F_GSO_UDP_L4_BIT, /* ... UDP payload GSO (not UFO) */ NETIF_F_GSO_FRAGLIST_BIT, /* ... 
Fraglist GSO */ + NETIF_F_GSO_ACCECN_BIT, /* TCP AccECN w/ TSO (no clear CWR) */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ - NETIF_F_GSO_FRAGLIST_BIT, + NETIF_F_GSO_ACCECN_BIT, NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */ - __UNUSED_NETIF_F_37, NETIF_F_NTUPLE_BIT, /* N-tuple filters supported */ NETIF_F_RXHASH_BIT, /* Receive hashing offload */ NETIF_F_RXCSUM_BIT, /* Receive checksumming offload */ @@ -128,6 +128,7 @@ enum { #define NETIF_F_SG __NETIF_F(SG) #define NETIF_F_TSO6 __NETIF_F(TSO6) #define NETIF_F_TSO_ECN __NETIF_F(TSO_ECN) +#define NETIF_F_GSO_ACCECN __NETIF_F(GSO_ACCECN) #define NETIF_F_TSO __NETIF_F(TSO) #define NETIF_F_VLAN_CHALLENGED __NETIF_F(VLAN_CHALLENGED) #define NETIF_F_RXFCS __NETIF_F(RXFCS) @@ -210,7 +211,8 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) NETIF_F_TSO_ECN | NETIF_F_TSO_MANGLEID) /* List of features with software fallbacks. */ -#define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | NETIF_F_GSO_SCTP | \ +#define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | \ + NETIF_F_GSO_ACCECN | NETIF_F_GSO_SCTP | \ NETIF_F_GSO_UDP_L4 | NETIF_F_GSO_FRAGLIST) /* diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0dbfe069a6e3..67527243459b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5269,6 +5269,8 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_UDP_L4 != (NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_FRAGLIST != (NETIF_F_GSO_FRAGLIST >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_TCP_ACCECN != + (NETIF_F_GSO_ACCECN >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 14517e95a46c..b8a1343d6785 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -708,6 +708,8 @@ enum { SKB_GSO_UDP_L4 = 1 << 17, SKB_GSO_FRAGLIST = 1 << 18, + + SKB_GSO_TCP_ACCECN = 1 << 19, }; #if BITS_PER_LONG > 32 -- cgit v1.2.3 From f6fc1584f500e0c036190f235b0857c6b05ee0d1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 6 Feb 2025 19:14:24 -0800 Subject: jbd2: remove redundant function jbd2_journal_has_csum_v2or3_feature Since commit dd348f054b24 ("jbd2: switch to using the crc32c library"), jbd2_journal_has_csum_v2or3() and jbd2_journal_has_csum_v2or3_feature() are the same. Remove jbd2_journal_has_csum_v2or3_feature() and just keep jbd2_journal_has_csum_v2or3(). Signed-off-by: Eric Biggers Reviewed-by: Darrick J. 
Wong Link: https://patch.msgid.link/20250207031424.42755-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 8018d4aecf1b..c211de10e9c8 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1727,14 +1727,10 @@ static inline int tid_geq(tid_t x, tid_t y) extern int jbd2_journal_blocks_per_page(struct inode *inode); extern size_t journal_tag_bytes(journal_t *journal); -static inline bool jbd2_journal_has_csum_v2or3_feature(journal_t *j) -{ - return jbd2_has_feature_csum2(j) || jbd2_has_feature_csum3(j); -} - static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) { - return jbd2_journal_has_csum_v2or3_feature(journal); + return jbd2_has_feature_csum2(journal) || + jbd2_has_feature_csum3(journal); } static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb) -- cgit v1.2.3 From 82d3639ef7dc54d5b5cb454d9a13005202d7a701 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 9 Mar 2025 20:07:42 +0200 Subject: net/mlx5: fs, add support for flow meters HWS action Add support for HW Steering action of flow meter range. Flow meters range can use one HWS action for the whole range. Thus, share a cached HWS action among rules that use same flow meter object range. Hold refcount for each rule using the cached action. Signed-off-by: Moshe Shemesh Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Reviewed-by: Jacob Keller Link: https://patch.msgid.link/1741543663-22123-3-git-send-email-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- include/linux/mlx5/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index fd62b2b1611d..939e58c2f386 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -244,6 +244,7 @@ void mlx5_destroy_flow_group(struct mlx5_flow_group *fg); struct mlx5_exe_aso { u32 object_id; + int base_id; u8 type; u8 return_reg_id; union { -- cgit v1.2.3 From 43e2aa56aea2e292c209cfd77c3f2d8553fb66e9 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 9 Mar 2025 21:04:14 +0100 Subject: net: phy: move PHY package MMD access function declarations from phy.h to phylib.h These functions are used by PHY drivers only, therefore move their declaration to phylib.h. 
Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://patch.msgid.link/406c8a20-b62e-4ee3-b174-b566724a0876@gmail.com Signed-off-by: Paolo Abeni --- include/linux/phy.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index c4a6385faf41..fc028bab10b7 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2107,18 +2107,10 @@ int __phy_hwtstamp_set(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); -int __phy_package_read_mmd(struct phy_device *phydev, - unsigned int addr_offset, int devad, - u32 regnum); - int phy_package_read_mmd(struct phy_device *phydev, unsigned int addr_offset, int devad, u32 regnum); -int __phy_package_write_mmd(struct phy_device *phydev, - unsigned int addr_offset, int devad, - u32 regnum, u16 val); - int phy_package_write_mmd(struct phy_device *phydev, unsigned int addr_offset, int devad, u32 regnum, u16 val); -- cgit v1.2.3 From 8ea221b22172a2cc5d0edbfec4b34ef3fe8de167 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 9 Mar 2025 21:05:08 +0100 Subject: net: phy: remove unused functions phy_package_[read|write]_mmd These functions have never had a user, so remove them. Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://patch.msgid.link/5792e2cd-6f0a-4f7d-a5ef-b932f94d82f3@gmail.com Signed-off-by: Paolo Abeni --- include/linux/phy.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index fc028bab10b7..61a8cb9d1247 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2107,14 +2107,6 @@ int __phy_hwtstamp_set(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); -int phy_package_read_mmd(struct phy_device *phydev, - unsigned int addr_offset, int devad, - u32 regnum); - -int phy_package_write_mmd(struct phy_device *phydev, - unsigned int addr_offset, int devad, - u32 regnum, u16 val); - extern const struct bus_type mdio_bus_type; struct mdio_board_info { -- cgit v1.2.3 From 16b1936ae6d1858252413bf4bae8bcf247eb4b4c Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 10 Mar 2025 07:49:34 +0000 Subject: lib/rbtree: add random seed The current test uses a pseudo-random function with a fixed seed, which means the test data follows the same pattern each time. Add a random seed parameter to randomize the test.
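A minimal sketch (not the test module's actual code) of how such a parameter typically plugs into the kernel's pseudo-random helpers; the identifier names and the default value are illustrative, and the ullong typedef added by the hunk below is the kind of type a module_param like this can use:

    #include <linux/moduleparam.h>
    #include <linux/prandom.h>

    static ullong seed = 42;        /* illustrative default; fixed => reproducible runs */
    module_param(seed, ullong, 0444);
    MODULE_PARM_DESC(seed, "Random seed for generating test data");

    static struct rnd_state rnd;

    static void test_seed_init(void)
    {
            /* re-running with the same seed replays the same test pattern */
            prandom_seed_state(&rnd, seed);
    }

    static u32 test_rand(u32 max)
    {
            return prandom_u32_state(&rnd) % max;
    }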
Link: https://lkml.kernel.org/r/20250310074938.26756-4-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Jason Gunthorpe Signed-off-by: Andrew Morton --- include/linux/types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/types.h b/include/linux/types.h index 1c509ce8f7f6..864ab701f297 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -92,6 +92,7 @@ typedef unsigned char unchar; typedef unsigned short ushort; typedef unsigned int uint; typedef unsigned long ulong; +typedef unsigned long long ullong; #ifndef __BIT_TYPES_DEFINED__ #define __BIT_TYPES_DEFINED__ -- cgit v1.2.3 From 19811285784f7f3d4abaa6e94623f2e7d10219d0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 10 Mar 2025 07:49:37 +0000 Subject: lib/interval_tree: skip the check before go to the right subtree The interval_tree_subtree_search() holds the loop invariant: start <= node->ITSUBTREE Let's say we have a following tree: node / \ left right So we know node->ITSUBTREE is contributed by one of the following: * left->ITSUBTREE * ITLAST(node) * right->ITSUBTREE When we come to the right node, we are sure the first two don't contribute to node->ITSUBTREE and it must be the right node does the job. So skip the check before go to the right subtree. Link: https://lkml.kernel.org/r/20250310074938.26756-7-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Jason Gunthorpe Signed-off-by: Andrew Morton --- include/linux/interval_tree_generic.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h index aaa8a0767aa3..1b400f26f63d 100644 --- a/include/linux/interval_tree_generic.h +++ b/include/linux/interval_tree_generic.h @@ -104,12 +104,8 @@ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ if (ITSTART(node) <= last) { /* Cond1 */ \ if (start <= ITLAST(node)) /* Cond2 */ \ return node; /* node is leftmost match */ \ - if (node->ITRB.rb_right) { \ - node = rb_entry(node->ITRB.rb_right, \ - ITSTRUCT, ITRB); \ - if (start <= node->ITSUBTREE) \ - continue; \ - } \ + node = rb_entry(node->ITRB.rb_right, ITSTRUCT, ITRB); \ + continue; \ } \ return NULL; /* No match */ \ } \ -- cgit v1.2.3 From 35b862eac9099bb492229627c44e541633fb5938 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 10 Mar 2025 11:10:52 +0000 Subject: net: phylink: expand on .pcs_config() method documentation Expand on the requirements of the .pcs_config() method documentation, specifically mentioning that it should cause minimal disruption to an established link, and that it should return a positive non-zero value when requiring the .pcs_an_restart() method to be called. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1trb24-005oVq-Is@rmk-PC.armlinux.org.uk Signed-off-by: Paolo Abeni --- include/linux/phylink.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index c187267a15b6..79876c84ae81 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -595,6 +595,14 @@ void pcs_get_state(struct phylink_pcs *pcs, unsigned int neg_mode, * The %neg_mode argument should be tested via the phylink_mode_*() family of * functions, or for PCS that set pcs->neg_mode true, should be tested * against the PHYLINK_PCS_NEG_* definitions. 
+ * + * pcs_config() will be called when configuration of the PCS is required + * or when the advertisement is possibly updated. It must not unnecessarily + * disrupt an established link. + * + * When an autonegotiation restart is required for 802.3z modes, .pcs_config() + * should return a positive non-zero integer (e.g. 1) to indicate to phylink + * to call the pcs_an_restart() method. */ int pcs_config(struct phylink_pcs *pcs, unsigned int neg_mode, phy_interface_t interface, const unsigned long *advertising, -- cgit v1.2.3 From 35a566a24e58f1b5f89737edf60b77de58719ed0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Feb 2025 18:14:26 -0500 Subject: NFSv4: Avoid unnecessary scans of filesystems for returning delegations The amount of looping through the list of delegations is occasionally leading to soft lockups. If the state manager was asked to return delegations asynchronously, it should only scan those filesystems that hold delegations that need to be returned. Fixes: af3b61bf6131 ("NFSv4: Clean up nfs_client_return_marked_delegations()") Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index f00bfcee7120..4e9ad6f6e907 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -250,6 +250,8 @@ struct nfs_server { struct list_head ss_copies; struct list_head ss_src_copies; + unsigned long delegation_flags; +#define NFS4SERV_DELEGRETURN (1) unsigned long delegation_gen; unsigned long mig_gen; unsigned long mig_status; -- cgit v1.2.3 From f163aa81a799e2d46d7f8f0b42a0e7770eaa0d06 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Feb 2025 19:03:21 -0500 Subject: NFSv4: Avoid unnecessary scans of filesystems for expired delegations The amount of looping through the list of delegations is occasionally leading to soft lockups. If the state manager was asked to reap the expired delegations, it should scan only those filesystems that hold delegations that need to be reaped. Fixes: 7f156ef0bf45 ("NFSv4: Clean up nfs_delegation_reap_expired()") Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 4e9ad6f6e907..7d6f164036fa 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -252,6 +252,7 @@ struct nfs_server { unsigned long delegation_flags; #define NFS4SERV_DELEGRETURN (1) +#define NFS4SERV_DELEGATION_EXPIRED (2) unsigned long delegation_gen; unsigned long mig_gen; unsigned long mig_status; -- cgit v1.2.3 From e767b59e29b8327d25edde65efc743f479f30d0a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Feb 2025 18:37:51 -0500 Subject: NFSv4: Avoid unnecessary scans of filesystems for delayed delegations The amount of looping through the list of delegations is occasionally leading to soft lockups. If the state manager was asked to manage the delayed return of delegations, then only scan those filesystems containing delegations that were marked as being delayed. 
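The scan shape shared by all three of these delegation patches is roughly the sketch below; illustrative only, since the helper name and loop body are assumptions, while NFS4SERV_DELEGRETURN_DELAYED and delegation_flags come from the hunks. The state manager tests and clears the per-server bit and skips any filesystem that was never flagged:

    static void nfs_scan_delayed_delegations(struct nfs_client *clp)
    {
            struct nfs_server *server;

            rcu_read_lock();
            list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
                    /* Only touch filesystems with delayed returns pending */
                    if (!test_and_clear_bit(NFS4SERV_DELEGRETURN_DELAYED,
                                            &server->delegation_flags))
                            continue;
                    /* ... walk only this server's delegations here ... */
            }
            rcu_read_unlock();
    }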
Fixes: be20037725d1 ("NFSv4: Fix delegation return in cases where we have to retry") Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 7d6f164036fa..108862d81b57 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -253,6 +253,7 @@ struct nfs_server { unsigned long delegation_flags; #define NFS4SERV_DELEGRETURN (1) #define NFS4SERV_DELEGATION_EXPIRED (2) +#define NFS4SERV_DELEGRETURN_DELAYED (3) unsigned long delegation_gen; unsigned long mig_gen; unsigned long mig_status; -- cgit v1.2.3 From 8955e7ce61fb6ba28f88e0a38479c992991ba6ab Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 13 Jan 2025 10:32:39 -0500 Subject: NFS: Implement NFSv4.2's OFFLOAD_STATUS XDR Add XDR encoding and decoding functions for the NFSv4.2 OFFLOAD_STATUS operation. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Reviewed-by: Benjamin Coddington Link: https://lore.kernel.org/r/20250113153235.48706-13-cel@kernel.org Signed-off-by: Trond Myklebust --- include/linux/nfs4.h | 1 + include/linux/nfs_xdr.h | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 9ac83ca88326..5fa60fe441b5 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -691,6 +691,7 @@ enum { NFSPROC4_CLNT_LISTXATTRS, NFSPROC4_CLNT_REMOVEXATTR, NFSPROC4_CLNT_READ_PLUS, + NFSPROC4_CLNT_OFFLOAD_STATUS, }; /* nfs41 types */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9155a6ffc370..dc5288111999 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1515,8 +1515,9 @@ struct nfs42_offload_status_args { struct nfs42_offload_status_res { struct nfs4_sequence_res osr_seq_res; - uint64_t osr_count; - int osr_status; + u64 osr_count; + int complete_count; + u32 osr_complete; }; struct nfs42_copy_notify_args { -- cgit v1.2.3 From 77dd8a302f56679178924f82a29eaf437857ea75 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 13 Jan 2025 10:32:40 -0500 Subject: NFS: Implement NFSv4.2's OFFLOAD_STATUS operation Enable the Linux NFS client to observe the progress of an offloaded asynchronous COPY operation. This new operation will be put to use in a subsequent patch. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever Reviewed-by: Benjamin Coddington Link: https://lore.kernel.org/r/20250113153235.48706-14-cel@kernel.org Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 108862d81b57..1711eefcf24f 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -293,6 +293,7 @@ struct nfs_server { #define NFS_CAP_CASE_INSENSITIVE (1U << 6) #define NFS_CAP_CASE_PRESERVING (1U << 7) #define NFS_CAP_REBOOT_LAYOUTRETURN (1U << 8) +#define NFS_CAP_OFFLOAD_STATUS (1U << 9) #define NFS_CAP_OPEN_XOR (1U << 12) #define NFS_CAP_DELEGTIME (1U << 13) #define NFS_CAP_POSIX_LOCK (1U << 14) -- cgit v1.2.3 From 743717670a36b80dfd43896dac5074f8ba5f163f Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 6 Mar 2025 14:32:40 +0800 Subject: jbd2: remove jbd2_journal_unfile_buffer() Since the function jbd2_journal_unfile_buffer() is no longer called anywhere after commit e5a120aeb57f ("jbd2: remove journal_head from descriptor buffers"), so let's remove it. 
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250306063240.157884-1-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index c211de10e9c8..023e8abdb99a 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1396,7 +1396,6 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) */ /* Filing buffers */ -extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *); extern bool __jbd2_journal_refile_buffer(struct journal_head *); extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); -- cgit v1.2.3 From e6fa3963a30d53427d33a577c5ba4bfd3373bc93 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:30:59 +1100 Subject: fs/dax: refactor wait for dax idle page A FS DAX page is considered idle when its refcount drops to one. This is currently open-coded in all file systems supporting FS DAX. Move the idle detection to a common function to make future changes easier. Link: https://lkml.kernel.org/r/c2c9d269110b90224eeb1dc661ffbc1d82aa20c9.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Acked-by: Theodore Ts'o Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chunyan Zhang Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/dax.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index df41a0017b31..9b1ce984d410 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -207,6 +207,14 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); +static inline int dax_wait_page_idle(struct page *page, + void (cb)(struct inode *), + struct inode *inode) +{ + return ___wait_var_event(page, page_ref_count(page) == 1, + TASK_INTERRUPTIBLE, 0, 0, cb(inode)); +} + #if IS_ENABLED(CONFIG_DAX) int dax_read_lock(void); void dax_read_unlock(int id); -- cgit v1.2.3 From d5b3afea22a52517e6bc835432e6dfd079b8bf7c Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:00 +1100 Subject: fs/dax: create a common implementation to break DAX layouts Prior to freeing a block file systems supporting FS DAX must check that the associated pages are both unmapped from user-space and not undergoing DMA or other access from eg. get_user_pages(). This is achieved by unmapping the file range and scanning the FS DAX page-cache to see if any pages within the mapping have an elevated refcount. This is done using two functions - dax_layout_busy_page() and dax_layout_busy_page_range() - which return a busy page whose refcount the caller then waits on to become idle.
Rather than open-code this introduce a common implementation to both unmap and wait for the page to become idle. Link: https://lkml.kernel.org/r/c4d381e41fc618296cee2820403c166d80599d5c.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Dan Williams Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/dax.h | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 9b1ce984d410..a6b277f1e13a 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -207,12 +207,9 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); -static inline int dax_wait_page_idle(struct page *page, - void (cb)(struct inode *), - struct inode *inode) +static inline bool dax_page_is_idle(struct page *page) { - return ___wait_var_event(page, page_ref_count(page) == 1, - TASK_INTERRUPTIBLE, 0, 0, cb(inode)); + return page && page_ref_count(page) == 1; } #if IS_ENABLED(CONFIG_DAX) @@ -228,6 +225,15 @@ static inline void dax_read_unlock(int id) { } #endif /* CONFIG_DAX */ + +#if !IS_ENABLED(CONFIG_FS_DAX) +static inline int __must_check dax_break_layout(struct inode *inode, + loff_t start, loff_t end, void (cb)(struct inode *)) +{ + return 0; +} +#endif + bool dax_alive(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, @@ -251,6 +257,13 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); +int __must_check dax_break_layout(struct inode *inode, loff_t start, + loff_t end, void (cb)(struct inode *)); +static inline int __must_check dax_break_layout_inode(struct inode *inode, + void (cb)(struct inode *)) +{ + return dax_break_layout(inode, 0, LLONG_MAX, cb); +} int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dest, loff_t destoff, loff_t len, bool *is_same, -- cgit v1.2.3 From bde708f1a65d025c45575bfe1e7bf7bdf7e71e87 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:01 +1100 Subject: fs/dax: always remove DAX page-cache entries when breaking layouts Prior to any truncation operations file systems call dax_break_mapping() to ensure pages in the range are not under going DMA. Later DAX page-cache entries will be removed by truncate_folio_batch_exceptionals() in the generic page-cache code. However this makes it possible for folios to be removed from the page-cache even though they are still DMA busy if the file-system hasn't called dax_break_mapping(). 
It also means they can never be waited on in future because FS DAX will lose track of them once the page-cache entry has been deleted. Instead it is better to delete the FS DAX entry when the file-system calls dax_break_mapping() as part of it's truncate operation. This ensures only idle pages can be removed from the FS DAX page-cache and makes it easy to detect if a file-system hasn't called dax_break_mapping() prior to a truncate operation. Link: https://lkml.kernel.org/r/3be6115eaaa8d28fee37fcba3287be4f226a7d24.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Dan Williams Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/dax.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index a6b277f1e13a..2fbb262092ca 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -255,6 +255,8 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, pfn_t pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); +void dax_delete_mapping_range(struct address_space *mapping, + loff_t start, loff_t end); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); int __must_check dax_break_layout(struct inode *inode, loff_t start, -- cgit v1.2.3 From 0e2f80afcfa699ce722c01afc9286a942bd57211 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:02 +1100 Subject: fs/dax: ensure all pages are idle prior to filesystem unmount File systems call dax_break_mapping() prior to reallocating file system blocks to ensure the page is not undergoing any DMA or other accesses. Generally this is needed when a file is truncated to ensure that if a block is reallocated nothing is writing to it. However filesystems currently don't call this when an FS DAX inode is evicted. This can cause problems when the file system is unmounted as a page can continue to be under going DMA or other remote access after unmount. This means if the file system is remounted any truncate or other operation which requires the underlying file system block to be freed will not wait for the remote access to complete. Therefore a busy block may be reallocated to a new file leading to corruption. Link: https://lkml.kernel.org/r/2d3cf575bbd095084993154be2f0aa7442e5cd28.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: Dan Wiliams Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/dax.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 2fbb262092ca..2333c30f6d36 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -232,6 +232,10 @@ static inline int __must_check dax_break_layout(struct inode *inode, { return 0; } + +static inline void dax_break_layout_final(struct inode *inode) +{ +} #endif bool dax_alive(struct dax_device *dax_dev); @@ -266,6 +270,7 @@ static inline int __must_check dax_break_layout_inode(struct inode *inode, { return dax_break_layout(inode, 0, LLONG_MAX, cb); } +void dax_break_layout_final(struct inode *inode); int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dest, loff_t destoff, loff_t len, bool *is_same, -- cgit v1.2.3 From cbe298d82cf70aa6bb16cfc8ae4db522f39a0034 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:03 +1100 Subject: fs/dax: remove PAGE_MAPPING_DAX_SHARED mapping flag The page ->mapping pointer can have magic values like PAGE_MAPPING_DAX_SHARED and PAGE_MAPPING_ANON for page owner specific usage. Currently PAGE_MAPPING_DAX_SHARED and PAGE_MAPPING_ANON alias to the same value. This isn't a problem because FS DAX pages are never seen by the anonymous mapping code and vice versa. However a future change will make FS DAX pages more like normal pages, so folio_test_anon() must not return true for a FS DAX page. We could explicitly test for a FS DAX page in folio_test_anon(), etc. however the PAGE_MAPPING_DAX_SHARED flag isn't actually needed. Instead we can use the page->mapping field to implicitly track the first mapping of a page. If page->mapping is non-NULL it implies the page is associated with a single mapping at page->index. If the page is associated with a second mapping clear page->mapping and set page->share to 1. This is possible because a shared mapping implies the file-system implements dax_holder_operations which makes the ->mapping and ->index, which is a union with ->share, unused. The page is considered shared when page->mapping == NULL and page->share > 0 or page->mapping != NULL, implying it is present in at least one address space. This also makes it easier for a future change to detect when a page is first mapped into an address space which requires special handling. Link: https://lkml.kernel.org/r/c22f699202db0acee2f7039eb026e68261ce42d6.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Tested-by: Alison Schofield Cc: Asahi Lina Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: Dan Wiliams Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Ted Ts'o Cc: Vishal Verma Cc: WANG Xuerui Cc: Will Deacon Cc: Alexander Gordeev Cc: Balbir Singh Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Jason Gunthorpe Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vivek Goyal Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 36d283552f80..cab382bd965e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -673,12 +673,6 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) -/* - * Different with flags above, this flag is used only for fsdax mode. It - * indicates that this page->mapping is now under reflink case. - */ -#define PAGE_MAPPING_DAX_SHARED ((void *)0x1) - static __always_inline bool folio_mapping_flags(const struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0; -- cgit v1.2.3 From 82ba975e4c43d98afebced82d940ddb7aec42a9d Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:06 +1100 Subject: mm: allow compound zone device pages Zone device pages are used to represent various type of device memory managed by device drivers. Currently compound zone device pages are not supported. This is because MEMORY_DEVICE_FS_DAX pages are the only user of higher order zone device pages and have their own page reference counting. A future change will unify FS DAX reference counting with normal page reference counting rules and remove the special FS DAX reference counting. Supporting that requires compound zone device pages. Supporting compound zone device pages requires compound_head() to distinguish between head and tail pages whilst still preserving the special struct page fields that are specific to zone device pages. A tail page is distinguished by having bit zero being set in page->compound_head, with the remaining bits pointing to the head page. For zone device pages page->compound_head is shared with page->pgmap. The page->pgmap field must be common to all pages within a folio, even if the folio spans memory sections. Therefore pgmap is the same for both head and tail pages and can be moved into the folio and we can use the standard scheme to find compound_head from a tail page. Link: https://lkml.kernel.org/r/67055d772e6102accf85161d0b57b0b3944292bf.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Signed-off-by: Balbir Singh Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Acked-by: David Hildenbrand Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memremap.h | 6 +++--- include/linux/migrate.h | 4 ++-- include/linux/mm_types.h | 9 +++++++-- include/linux/mmzone.h | 12 +++++++++++- 4 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 3f7143ade32c..0256a4218dc3 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -161,7 +161,7 @@ static inline bool is_device_private_page(const struct page *page) { return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PRIVATE; + page_pgmap(page)->type == MEMORY_DEVICE_PRIVATE; } static inline bool folio_is_device_private(const struct folio *folio) @@ -173,13 +173,13 @@ static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; + page_pgmap(page)->type == MEMORY_DEVICE_PCI_P2PDMA; } static inline bool is_device_coherent_page(const struct page *page) { return is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_COHERENT; + page_pgmap(page)->type == MEMORY_DEVICE_COHERENT; } static inline bool folio_is_device_coherent(const struct folio *folio) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 29919faea2f1..61899ec7a9a3 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -205,8 +205,8 @@ struct migrate_vma { unsigned long end; /* - * Set to the owner value also stored in page->pgmap->owner for - * migrating out of device private memory. The flags also need to + * Set to the owner value also stored in page_pgmap(page)->owner + * for migrating out of device private memory. The flags also need to * be set to MIGRATE_VMA_SELECT_DEVICE_PRIVATE. * The caller should always set this field when using mmu notifier * callbacks to avoid device MMU invalidations for device private diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6a93abb4452b..0fa7907d437e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -130,8 +130,11 @@ struct page { unsigned long compound_head; /* Bit zero is set */ }; struct { /* ZONE_DEVICE pages */ - /** @pgmap: Points to the hosting device page map. */ - struct dev_pagemap *pgmap; + /* + * The first word is used for compound_head or folio + * pgmap + */ + void *_unused_pgmap_compound_head; void *zone_device_data; /* * ZONE_DEVICE private pages are counted as being @@ -300,6 +303,7 @@ typedef struct { * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. + * @pgmap: Metadata for ZONE_DEVICE mappings * @virtual: Virtual address in the kernel direct map. * @_last_cpupid: IDs of last CPU and last process that accessed the folio. * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). 
@@ -338,6 +342,7 @@ struct folio { /* private: */ }; /* public: */ + struct dev_pagemap *pgmap; }; struct address_space *mapping; pgoff_t index; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 44ecb2f90db4..550dbba92521 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1158,6 +1158,12 @@ static inline bool is_zone_device_page(const struct page *page) return page_zonenum(page) == ZONE_DEVICE; } +static inline struct dev_pagemap *page_pgmap(const struct page *page) +{ + VM_WARN_ON_ONCE_PAGE(!is_zone_device_page(page), page); + return page_folio(page)->pgmap; +} + /* * Consecutive zone device pages should not be merged into the same sgl * or bvec segment with other types of pages or if they belong to different @@ -1173,7 +1179,7 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, return false; if (!is_zone_device_page(a)) return true; - return a->pgmap == b->pgmap; + return page_pgmap(a) == page_pgmap(b); } extern void memmap_init_zone_device(struct zone *, unsigned long, @@ -1188,6 +1194,10 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, { return true; } +static inline struct dev_pagemap *page_pgmap(const struct page *page) +{ + return NULL; +} #endif static inline bool folio_is_zone_device(const struct folio *folio) -- cgit v1.2.3 From ec2e0cc67f9c3ed5926da390ad86b25c142a2e4a Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:08 +1100 Subject: mm/memory: add vmf_insert_page_mkwrite() Currently to map a DAX page the DAX driver calls vmf_insert_pfn. This creates a special devmap PTE entry for the pfn but does not take a reference on the underlying struct page for the mapping. This is because DAX page refcounts are treated specially, as indicated by the presence of a devmap entry. To allow DAX page refcounts to be managed the same as normal page refcounts introduce vmf_insert_page_mkwrite(). This will take a reference on the underlying page much the same as vmf_insert_page, except it also permits upgrading an existing mapping to be writable if requested/possible. Link: https://lkml.kernel.org/r/4ce3aa984c060f370105e0bfef1035869578be47.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: David Hildenbrand Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: Dan Wiliams Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9a74a3ee68bc..c6fb9300f6c0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3644,6 +3644,8 @@ int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num); int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, unsigned long num); +vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, + bool write); vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, -- cgit v1.2.3 From 349994cf61e6eaa5996d53e45ffd2272d32d5e4e Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:09 +1100 Subject: mm/rmap: add support for PUD sized mappings to rmap The rmap doesn't currently support adding a PUD mapping of a folio. This patch adds support for entire PUD mappings of folios, primarily to allow for more standard refcounting of device DAX folios. Currently DAX is the only user of this and it doesn't require support for partially mapped PUD-sized folios so we don't support for that for now. Link: https://lkml.kernel.org/r/248582c07896e30627d1aeaeebc6949cfd91b851.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: David Hildenbrand Reviewed-by: Dan Williams Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/rmap.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 69e9a431a40e..6abf7960077a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -192,6 +192,7 @@ typedef int __bitwise rmap_t; enum rmap_level { RMAP_LEVEL_PTE = 0, RMAP_LEVEL_PMD, + RMAP_LEVEL_PUD, }; static inline void __folio_rmap_sanity_checks(const struct folio *folio, @@ -228,6 +229,14 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); break; + case RMAP_LEVEL_PUD: + /* + * Assume that we are creating a single "entire" mapping of the + * folio. 
+ */ + VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio); + VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio); + break; default: VM_WARN_ON_ONCE(true); } @@ -251,12 +260,16 @@ void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, folio_add_file_rmap_ptes(folio, page, 1, vma) void folio_add_file_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); +void folio_add_file_rmap_pud(struct folio *, struct page *, + struct vm_area_struct *); void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_remove_rmap_pte(folio, page, vma) \ folio_remove_rmap_ptes(folio, page, 1, vma) void folio_remove_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); +void folio_remove_rmap_pud(struct folio *, struct page *, + struct vm_area_struct *); void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address, rmap_t flags); @@ -341,6 +354,7 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: + case RMAP_LEVEL_PUD: atomic_inc(&folio->_entire_mapcount); atomic_inc(&folio->_large_mapcount); break; @@ -437,6 +451,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: + case RMAP_LEVEL_PUD: if (PageAnonExclusive(page)) { if (unlikely(maybe_pinned)) return -EBUSY; -- cgit v1.2.3 From dbe54153296d0931144a63295fc9367794bbe797 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:10 +1100 Subject: mm/huge_memory: add vmf_insert_folio_pud() Currently DAX folio/page reference counts are managed differently to normal pages. To allow these to be managed the same as normal pages introduce vmf_insert_folio_pud. This will map the entire PUD-sized folio and take references as it would for a normally mapped page. This is distinct from the current mechanism, vmf_insert_pfn_pud, which simply inserts a special devmap PUD entry into the page table without holding a reference to the page for the mapping. Link: https://lkml.kernel.org/r/649a1ef91d556593948351e94f51ef73a14f6794.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Dan Williams Acked-by: David Hildenbrand Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e1bea54820ff..4eeb54b50621 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -39,6 +39,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write); vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write); +vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, + bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_UNSUPPORTED, -- cgit v1.2.3 From 6c88f72691f88fd1fc493a3c2eed97f91b82b3f0 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:11 +1100 Subject: mm/huge_memory: add vmf_insert_folio_pmd() Currently DAX folio/page reference counts are managed differently to normal pages. To allow these to be managed the same as normal pages introduce vmf_insert_folio_pmd. This will map the entire PMD-sized folio and take references as it would for a normally mapped page. This is distinct from the current mechanism, vmf_insert_pfn_pmd, which simply inserts a special devmap PMD entry into the page table without holding a reference to the page for the mapping. It is not currently useful to implement a more generic vmf_insert_folio() which selects the correct behaviour based on folio_order(). This is because PTE faults require only a subpage of the folio to be PTE mapped rather than the entire folio. It would be possible to add this context somewhere but callers already need to handle PTE faults and PMD faults separately so a more generic function is not useful. Link: https://lkml.kernel.org/r/7bf92a2e68225d13ea368d53bbfee327314d1c40.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: David Hildenbrand Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: Dan Wiliams Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 4eeb54b50621..e57e811cfd3c 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -39,6 +39,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write); vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write); +vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, + bool write); vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, bool write); -- cgit v1.2.3 From e5cb23256347469c80138a2a8804887239465d0b Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:12 +1100 Subject: mm/gup: don't allow FOLL_LONGTERM pinning of FS DAX pages Longterm pinning of FS DAX pages should already be disallowed by various pXX_devmap checks. However a future change will cause these checks to be invalid for FS DAX pages so make folio_is_longterm_pinnable() return false for FS DAX pages. Link: https://lkml.kernel.org/r/250a31876704b79f7c65b159f3c835e547f052df.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: John Hubbard Reviewed-by: Dan Williams Acked-by: David Hildenbrand Tested-by: Alison Schofield Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memremap.h | 11 +++++++++++ include/linux/mm.h | 7 +++++++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 0256a4218dc3..4aa151914eab 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -187,6 +187,17 @@ static inline bool folio_is_device_coherent(const struct folio *folio) return is_device_coherent_page(&folio->page); } +static inline bool is_fsdax_page(const struct page *page) +{ + return is_zone_device_page(page) && + page_pgmap(page)->type == MEMORY_DEVICE_FS_DAX; +} + +static inline bool folio_is_fsdax(const struct folio *folio) +{ + return is_fsdax_page(&folio->page); +} + #ifdef CONFIG_ZONE_DEVICE void zone_device_page_init(struct page *page); void *memremap_pages(struct dev_pagemap *pgmap, int nid); diff --git a/include/linux/mm.h b/include/linux/mm.h index c6fb9300f6c0..009484a07074 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2115,6 +2115,13 @@ static inline bool folio_is_longterm_pinnable(struct folio *folio) if (folio_is_device_coherent(folio)) return false; + /* + * Filesystems can only tolerate transient delays to truncate and + * hole-punch operations + */ + if (folio_is_fsdax(folio)) + return false; + /* Otherwise, non-movable zone folios can be pinned. */ return !folio_is_zone_movable(folio); -- cgit v1.2.3 From 38607c62b34b46317c46d5baf1df03ac6e48a1c6 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 28 Feb 2025 14:31:14 +1100 Subject: fs/dax: properly refcount fs dax pages Currently fs dax pages are considered free when the refcount drops to one and their refcounts are not increased when mapped via PTEs or decreased when unmapped. This requires special logic in mm paths to detect that these pages should not be properly refcounted, and to detect when the refcount drops to one instead of zero. On the other hand get_user_pages(), etc. will properly refcount fs dax pages by taking a reference and dropping it when the page is unpinned. Tracking this special behaviour requires extra PTE bits (eg. pte_devmap) and introduces rules that are potentially confusing and specific to FS DAX pages. To fix this, and to possibly allow removal of the special PTE bits in future, convert the fs dax page refcounts to be zero based and instead take a reference on the page each time it is mapped as is currently the case for normal pages. This may also allow a future clean-up to remove the pgmap refcounting that is currently done in mm/gup.c. Link: https://lkml.kernel.org/r/c7d886ad7468a20452ef6e0ddab6cfe220874e7c.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Dan Williams Tested-by: Alison Schofield Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Asahi Lina Cc: Balbir Singh Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: Dave Jiang Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huacai Chen Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: linmiaohe Cc: Logan Gunthorpe Cc: Matthew Wilcow (Oracle) Cc: Michael "Camp Drill Sergeant" Ellerman Cc: Nicholas Piggin Cc: Peter Xu Cc: Sven Schnelle Cc: Ted Ts'o Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/dax.h | 2 +- include/linux/mm.h | 27 ++------------------------- include/linux/mm_types.h | 7 ++++++- 3 files changed, 9 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 2333c30f6d36..dcc9fcdf14e4 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -209,7 +209,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, static inline bool dax_page_is_idle(struct page *page) { - return page && page_ref_count(page) == 1; + return page && page_ref_count(page) == 0; } #if IS_ENABLED(CONFIG_DAX) diff --git a/include/linux/mm.h b/include/linux/mm.h index 009484a07074..de008efd96aa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1192,6 +1192,8 @@ int vma_is_stack_for_current(struct vm_area_struct *vma); struct mmu_gather; struct inode; +extern void prep_compound_page(struct page *page, unsigned int order); + /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be @@ -1513,25 +1515,6 @@ vm_fault_t finish_fault(struct vm_fault *vmf); * back into memory. */ -#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) -DECLARE_STATIC_KEY_FALSE(devmap_managed_key); - -bool __put_devmap_managed_folio_refs(struct folio *folio, int refs); -static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) -{ - if (!static_branch_unlikely(&devmap_managed_key)) - return false; - if (!folio_is_zone_device(folio)) - return false; - return __put_devmap_managed_folio_refs(folio, refs); -} -#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ -static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) -{ - return false; -} -#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ - /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ ((unsigned int) folio_ref_count(folio) + 127u <= 127u) @@ -1652,12 +1635,6 @@ static inline void put_page(struct page *page) if (folio_test_slab(folio)) return; - /* - * For some devmap managed pages we need to catch refcount transition - * from 2 to 1: - */ - if (put_devmap_managed_folio_refs(folio, 1)) - return; folio_put(folio); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0fa7907d437e..b1827d78ff89 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -296,6 +296,8 @@ typedef struct { * anonymous memory. * @index: Offset within the file, in units of pages. For anonymous memory, * this is the index from the beginning of the mmap. + * @share: number of DAX mappings that reference this folio. See + * dax_associate_entry. * @private: Filesystem per-folio data (see folio_attach_private()). * @swap: Used for swp_entry_t if folio_test_swapcache(). * @_mapcount: Do not access this member directly. 
Use folio_mapcount() to @@ -345,7 +347,10 @@ struct folio { struct dev_pagemap *pgmap; }; struct address_space *mapping; - pgoff_t index; + union { + pgoff_t index; + unsigned long share; + }; union { void *private; swp_entry_t swap; -- cgit v1.2.3 From 6220ea5583e977e1eeea06c237cc440e5ac3de46 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:54 +0100 Subject: mm: factor out large folio handling from folio_order() into folio_large_order() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: MM owner tracking for large folios (!hugetlb) + CONFIG_NO_PAGE_MAPCOUNT", v3. Let's add an "easy" way to decide -- without false positives, without page-mapcounts and without page table/rmap scanning -- whether a large folio is "certainly mapped exclusively" into a single MM, or whether it "maybe mapped shared" into multiple MMs. Use that information to implement Copy-on-Write reuse, to convert folio_likely_mapped_shared() to folio_maybe_mapped_shared(), and to introduce a kernel config option that lets us not use+maintain per-page mapcounts in large folios anymore. The bigger picture was presented at LSF/MM [1]. This series is effectively a follow-up on my early work [2], which implemented a more precise, but also more complicated, way to identify whether a large folio is "mapped shared" into multiple MMs or "mapped exclusively" into a single MM. 1 Patch Organization ==================== Patch #1 -> #6: make more room in order-1 folios, so we have two "unsigned long" available for our purposes Patch #7 -> #11: preparations Patch #12: MM owner tracking for large folios Patch #13: COW reuse for PTE-mapped anon THP Patch #14: folio_maybe_mapped_shared() Patch #15 -> #20: introduce and implement CONFIG_NO_PAGE_MAPCOUNT 2 MM owner tracking =================== We assign each MM a unique ID ("MM ID"), to be able to squeeze more information into our folios. On 32bit we use 15-bit IDs, on 64bit we use 31-bit IDs. For each large folio, we now store two MM-ID+mapcount ("slot") combinations: * mm0_id + mm0_mapcount * mm1_id + mm1_mapcount On 32bit, we use a 16-bit per-MM mapcount, on 64bit an ordinary 32bit mapcount. This way, we require 2x "unsigned long" on 32bit and 64bit for both slots. Paired with the large mapcount, we can reliably identify whether one of these MMs is the current owner (-> owns all mappings) or even holds all folio references (-> owns all mappings, and all references are from mappings). As long as only two MMs map folio pages at a time, we can reliably and precisely identify whether a large folio is "mapped shared" or "mapped exclusively". Any additional MM that starts mapping the folio while there are no free slots becomes an "untracked MM". If one such "untracked MM" is the last one mapping a folio exclusively, we will not detect the folio as "mapped exclusively" but instead as "maybe mapped shared". (exception: only a single mapping remains) So that's where the approach gets imprecise. For now, we use a bit-spinlock to sync the large mapcount + slots, and make sure we do keep the machinery fast, to not degrade (un)map performance drastically: for example, we make sure to only use a single atomic (when grabbing the bit-spinlock), like we would already perform when updating the large mapcount.
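As a concrete illustration, here is a minimal userspace model of the add-mapping side of this two-slot scheme; all names are illustrative stand-ins rather than the kernel's, and the locking as well as the unmap side (which can mark a folio exclusive again) are omitted:

#include <stdbool.h>
#include <stdio.h>

#define SLOT_FREE 0	/* models MM_ID_DUMMY: slot unoccupied */

struct folio_model {
	unsigned int mm_id[2];	/* owning MM IDs, SLOT_FREE when empty */
	int mm_mapcount[2];	/* per-MM mapcounts */
	bool maybe_shared;	/* the sticky "maybe mapped shared" flag */
};

/* Track one more page mapping of the folio by @mm_id. */
static void model_add_mapping(struct folio_model *f, unsigned int mm_id)
{
	for (int i = 0; i < 2; i++) {
		if (f->mm_id[i] == mm_id) {	/* this MM is already tracked */
			f->mm_mapcount[i]++;
			return;
		}
	}
	for (int i = 0; i < 2; i++) {
		if (f->mm_id[i] == SLOT_FREE) {	/* take a free slot */
			f->mm_id[i] = mm_id;
			f->mm_mapcount[i] = 1;
			if (i == 1)	/* slot 0 holds mappings, too */
				f->maybe_shared = true;
			return;
		}
	}
	/* A third MM with no free slot becomes an "untracked MM". */
	f->maybe_shared = true;
}

int main(void)
{
	struct folio_model f = { 0 };

	model_add_mapping(&f, 1);	/* first MM: exclusive so far */
	model_add_mapping(&f, 2);	/* second MM: maybe shared */
	printf("maybe shared: %d\n", f.maybe_shared);	/* prints 1 */
	return 0;
}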
3 CONFIG_NO_PAGE_MAPCOUNT ========================= patch #15 -> #20 spell out and document what exactly is affected when not maintaining the per-page mapcounts in large folios anymore. Most importantly, as we cannot maintain folio->_nr_pages_mapped anymore when (un)mapping pages, we'll account a complete folio as mapped if a single page is mapped. In addition, we'll not detect partially mapped anonymous folios as such in all cases yet. Likely less relevant changes include that we might now under-estimate the USS (Unique Set Size) of a process, but never over-estimate it. The goal is to make CONFIG_NO_PAGE_MAPCOUNT the default at some point, to then slowly make it the only option, as we learn about real-life impacts and possible ways to mitigate them. 4 Performance ============= Detailed performance numbers were included in v1 [3], and not that much changed between v1 and v2. I did plenty of measurements on different systems in the meantime, which all revealed slightly different results. The pte-mapped-folio micro-benchmarks [4] are fairly sensitive to code layout changes on some systems. Especially the fork() benchmark started being more-shaky-than-before on recent kernels for some reason. In summary, with my micro-benchmarks: * Small folios are not impacted. * CoW performance seems to be mostly unchanged across all folio sizes. * CoW reuse performance of large folios now matches CoW reuse performance of small folios, because we now actually implement the CoW reuse optimization. On an Intel Xeon Silver 4210R I measured a ~65% reduction in runtime, on an arm64 system I measured ~54% reduction. * munmap() performance improves with CONFIG_NO_PAGE_MAPCOUNT. I saw double-digit % reduction (up to ~30% on an Intel Xeon Silver 4210R and up to ~70% on an AmpereOne A192-32X) with larger folios. The larger the folios, the larger the performance improvement. * munmap() performance very slightly (a couple of percent) degrades without CONFIG_NO_PAGE_MAPCOUNT for smaller folios. For larger folios, there seems to be no change at all. * fork() performance improves with CONFIG_NO_PAGE_MAPCOUNT. I saw double-digit % reduction (up to ~20% on an Intel Xeon Silver 4210R and up to ~10% on an AmpereOne A192-32X) with larger folios. The larger the folios, the larger the performance improvement. * While fork() performance without CONFIG_NO_PAGE_MAPCOUNT seems to be almost unchanged on some systems, I saw some degradation for smaller folios on the AmpereOne A192-32X. I did not investigate the details yet, but I suspect code layout changes or suboptimal code placement / inlining. I'm not too worried about the fork() micro-benchmarks for smaller folios given how shaky the results are lately and by how much we improved fork() performance recently. I also ran case-anon-cow-rand and case-anon-cow-seq, part of vm-scalability, to assess the scalability and the impact of the bit-spinlock. My measurements on a 2-socket 10-core Intel Xeon Silver 4210R CPU revealed no significant changes. Similarly, running these benchmarks with 2 MiB THPs enabled on the AmpereOne A192-32X with 192 cores, I got < 1% difference with < 1% stdev, which is nice. So far, I did not get my hands on a similarly large system with multiple sockets. I found no other fitting scalability benchmarks that seem to really hammer on concurrent mapping/unmapping of large folio pages like case-anon-cow-seq does. 5 Concerns ========== 5.1 Bit spinlock ---------------- I'm not quite happy about the bit-spinlock, but so far it does not seem to affect scalability in my measurements.
If it ever becomes a problem we could either investigate improving the locking, or simply stop the MM tracking once there are "too many mappings" and assume that the folio is "mapped shared" until it is freed. This would be similar (but slightly different) to the "0,1,2,stopped" counting idea Willy had at some point. Adding that logic to "stop tracking" adds more code to the hot path, so I avoided that for now. 5.2 folio_maybe_mapped_shared() ------------------------------- I documented the change from folio_likely_mapped_shared() to folio_maybe_mapped_shared() quite extensively. If we run into surprises, I have some ideas on how to resolve them. For now, I think we should be fine. 5.3 Added code to map/unmap hot path ------------------------------------ So far, it looks like the added code on the rmap hot path does not really seem to matter much in the bigger picture. I'd like to further reduce it (and possibly improve fork() performance further), but I don't easily see how right now. Well, and I am out of puff 🙂 Having that said, alternatives I considered (e.g., per-MM per-folio mapcount) would add a lot more overhead to these hot paths. 6 Future Work ============= 6.1 Large mapcount ------------------ It would be very handy if the large mapcount would count how often folio pages are actually mapped into page tables: a PMD on x86-64 would count 512 times. Calculating the average per-page mapcount will be easy, and remapping (PMD->PTE) folios would get even faster. That would also remove the need for the entire mapcount (except for PMD-sized folios for memory statistics reasons ...), and allow for mapping folios larger than PMDs (e.g., 4 MiB) easily. We likely would also have to take the same number of folio references to make our folio_mapcount() == folio_ref_count() work, and we'd want to be able to avoid mapcount+refcount overflows: this could already become an issue with pte-mapped PUD-sized folios (fsdax). One approach we discussed in the THP cabal meeting is (1) extending the mapcount for large folios to 64bit (at least on 64bit systems) and (2) keeping the refcount at 32bit, but (3) having exactly one reference if the mapcount != 0. It should be doable, but there are some corner cases to consider on the unmap path; it is something that I will be looking into next. 6.2 hugetlb ----------- I'd love to make use of the same tracking also for hugetlb. The real problem is PMD table sharing: getting a page mapped by MM X and unmapped by MM Y will not work. With mshare, that problem should not exist (all mapping/unmapping will be routed through the mshare MM). [1] https://lwn.net/Articles/974223/ [2] https://lore.kernel.org/linux-mm/a9922f58-8129-4f15-b160-e0ace581bcbe@redhat.com/T/ [3] https://lkml.kernel.org/r/20240829165627.2256514-1-david@redhat.com [4] https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/pte-mapped-folio-benchmarks.c This patch (of 20): Let's factor it out into a simple helper function. This helper will also come in handy when working with code where we know that our folio is large. Maybe in the future we'll have the order readily available for small and large folios; in that case, folio_large_order() would simply translate to folio_order(). Link: https://lkml.kernel.org/r/20250303163014.1128035-1-david@redhat.com Link: https://lkml.kernel.org/r/20250303163014.1128035-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lance Yang Reviewed-by: Kirill A.
Shutemov Cc: Thomas Gleixner Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: David Hildenbrand Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index de008efd96aa..e5cdbb94ef81 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1194,6 +1194,11 @@ struct inode; extern void prep_compound_page(struct page *page, unsigned int order); +static inline unsigned int folio_large_order(const struct folio *folio) +{ + return folio->_flags_1 & 0xff; +} + /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be @@ -1207,7 +1212,7 @@ static inline unsigned int compound_order(struct page *page) if (!test_bit(PG_head, &folio->flags)) return 0; - return folio->_flags_1 & 0xff; + return folio_large_order(folio); } /** @@ -1223,7 +1228,7 @@ static inline unsigned int folio_order(const struct folio *folio) { if (!folio_test_large(folio)) return 0; - return folio->_flags_1 & 0xff; + return folio_large_order(folio); } #include @@ -2145,7 +2150,7 @@ static inline long folio_nr_pages(const struct folio *folio) #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else - return 1L << (folio->_flags_1 & 0xff); + return 1L << folio_large_order(folio); #endif } @@ -2170,7 +2175,7 @@ static inline unsigned long compound_nr(struct page *page) #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else - return 1L << (folio->_flags_1 & 0xff); + return 1L << folio_large_order(folio); #endif } -- cgit v1.2.3 From 1ea5212aed0682ae1249cba9df7ad7ef539d0a7d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:55 +0100 Subject: mm: factor out large folio handling from folio_nr_pages() into folio_large_nr_pages() Let's factor it out into a simple helper function. This helper will also come in handy when working with code where we know that our folio is large. While at it, let's consistently return a "long" value from all these similar functions. Note that we cannot use "unsigned int" (even though _folio_nr_pages is of that type), because it would break some callers that do stuff like "-folio_nr_pages()". Both "int" or "unsigned long" would work as well. Maybe in the future we'll have the nr_pages readily available for all large folios, maybe even for small folios, or maybe for none. Link: https://lkml.kernel.org/r/20250303163014.1128035-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Kirill A. 
Shutemov Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e5cdbb94ef81..cf892c08a88c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1199,6 +1199,18 @@ static inline unsigned int folio_large_order(const struct folio *folio) return folio->_flags_1 & 0xff; } +#ifdef CONFIG_64BIT +static inline long folio_large_nr_pages(const struct folio *folio) +{ + return folio->_folio_nr_pages; +} +#else +static inline long folio_large_nr_pages(const struct folio *folio) +{ + return 1L << folio_large_order(folio); +} +#endif + /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be @@ -2147,11 +2159,7 @@ static inline long folio_nr_pages(const struct folio *folio) { if (!folio_test_large(folio)) return 1; -#ifdef CONFIG_64BIT - return folio->_folio_nr_pages; -#else - return 1L << folio_large_order(folio); -#endif + return folio_large_nr_pages(folio); } /* Only hugetlbfs can allocate folios larger than MAX_ORDER */ @@ -2166,24 +2174,20 @@ static inline long folio_nr_pages(const struct folio *folio) * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. */ -static inline unsigned long compound_nr(struct page *page) +static inline long compound_nr(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 1; -#ifdef CONFIG_64BIT - return folio->_folio_nr_pages; -#else - return 1L << folio_large_order(folio); -#endif + return folio_large_nr_pages(folio); } /** * thp_nr_pages - The number of regular pages in this huge page. * @page: The head page of a huge page. */ -static inline int thp_nr_pages(struct page *page) +static inline long thp_nr_pages(struct page *page) { return folio_nr_pages((struct folio *)page); } -- cgit v1.2.3
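As an aside, the order-to-nr_pages relation encoded by folio_large_order() and by the !CONFIG_64BIT fallback of folio_large_nr_pages() above is easy to check in a tiny userspace sketch; the flags_1 parameter is a simplified stand-in for folio->_flags_1, whose low byte carries the order:

#include <assert.h>
#include <stdio.h>

static unsigned int large_order(unsigned long flags_1)
{
	return flags_1 & 0xff;	/* order lives in the low byte */
}

static long large_nr_pages(unsigned long flags_1)
{
	return 1L << large_order(flags_1);	/* the !CONFIG_64BIT path */
}

int main(void)
{
	unsigned long flags_1 = 9;	/* an order-9 folio, e.g. a 2 MiB THP */

	assert(large_nr_pages(flags_1) == 512);	/* 512 x 4 KiB pages */
	printf("order %u -> %ld pages\n", large_order(flags_1),
	       large_nr_pages(flags_1));
	return 0;
}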
From 4996fc547f5b49f4a43c261dfadb02cf165cdb51 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:56 +0100 Subject: mm: let _folio_nr_pages overlay memcg_data in first tail page Let's free up some more of the "unconditionally available on 64BIT" space in order-1 folios by letting _folio_nr_pages overlay memcg_data in the first tail page (second folio page). Consequently, we have the optimization now whenever we have CONFIG_MEMCG, independent of 64BIT. We have to make sure that page->memcg on tail pages does not return "surprises". page_memcg_check() already properly refuses PageTail(). Let's do that earlier in print_page_owner_memcg() to avoid printing wrong "Slab cache page" information. No other code should touch that field on tail pages of compound pages. Reset the "_nr_pages" to 0 when splitting folios, or when freeing them back to the buddy (to avoid false page->memcg_data "bad page" reports). Note that in __split_huge_page(), folio_nr_pages() would stop working already as soon as we start messing with the subpages. Most kernel configs should have at least CONFIG_MEMCG enabled, even if disabled at runtime. A 64-byte "struct page" is what we usually have on 64BIT. While at it, rename "_folio_nr_pages" to "_nr_pages". Hopefully memdescs / dynamically allocating "struct folio" in the future will further clean this up, e.g., making _nr_pages available in all configs and maybe even in small folios. Doing that should be fairly easy on top of this change. [david@redhat.com: make "make htmldoc" happy] Link: https://lkml.kernel.org/r/a97f8a91-ec41-4796-81e3-7c9e0e491ba4@redhat.com Link: https://lkml.kernel.org/r/20250303163014.1128035-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Kirill A. Shutemov Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- include/linux/mm_types.h | 32 ++++++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index cf892c08a88c..c9c2ca345350 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1199,10 +1199,10 @@ static inline unsigned int folio_large_order(const struct folio *folio) return folio->_flags_1 & 0xff; } -#ifdef CONFIG_64BIT +#ifdef NR_PAGES_IN_LARGE_FOLIO static inline long folio_large_nr_pages(const struct folio *folio) { - return folio->_folio_nr_pages; + return folio->_nr_pages; } #else static inline long folio_large_nr_pages(const struct folio *folio) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b1827d78ff89..f3b519fbeca1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -287,6 +287,11 @@ typedef struct { unsigned long val; } swp_entry_t; +#if defined(CONFIG_MEMCG) || defined(CONFIG_SLAB_OBJ_EXT) +/* We have some extra room after the refcount in tail pages. */ +#define NR_PAGES_IN_LARGE_FOLIO +#endif + /** * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. @@ -312,7 +317,7 @@ typedef struct { * @_large_mapcount: Do not use directly, call folio_mapcount(). * @_nr_pages_mapped: Do not use outside of rmap and debug code. * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). - * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). + * @_nr_pages: Do not use directly, call folio_nr_pages(). * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h.
@@ -376,14 +381,23 @@ struct folio { struct { unsigned long _flags_1; unsigned long _head_1; + union { + struct { /* public: */ - atomic_t _large_mapcount; - atomic_t _entire_mapcount; - atomic_t _nr_pages_mapped; - atomic_t _pincount; -#ifdef CONFIG_64BIT - unsigned int _folio_nr_pages; -#endif + atomic_t _large_mapcount; + atomic_t _entire_mapcount; + atomic_t _nr_pages_mapped; + atomic_t _pincount; + /* private: the union with struct page is transitional */ + }; + unsigned long _usable_1[4]; + }; + atomic_t _mapcount_1; + atomic_t _refcount_1; + /* public: */ +#ifdef NR_PAGES_IN_LARGE_FOLIO + unsigned int _nr_pages; +#endif /* NR_PAGES_IN_LARGE_FOLIO */ /* private: the union with struct page is transitional */ }; struct page __page_1; @@ -435,6 +449,8 @@ FOLIO_MATCH(_last_cpupid, _last_cpupid); offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); +FOLIO_MATCH(_mapcount, _mapcount_1); +FOLIO_MATCH(_refcount, _refcount_1); #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ -- cgit v1.2.3 From 4eeec8c89a0c4a8c20fb13a4e7093cc8efce383d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:57 +0100 Subject: mm: move hugetlb specific things in folio to page[3] Let's just move the hugetlb specific stuff to a separate page, and stop letting it overlay other fields for now. This frees up some space in page[2], which we will use on 32bit to free up some space in page[1]. While we could move these things to page[3] instead, it's cleaner to just move the hugetlb specific things out of the way and pack the core-folio stuff as tight as possible. ... and we can minimize the work required in dump_folio. We can now avoid re-initializing &folio->_deferred_list in hugetlb code. Hopefully dynamically allocating "struct folio" in the future will further clean this up. Link: https://lkml.kernel.org/r/20250303163014.1128035-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A.
Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f3b519fbeca1..727322ecbfdd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -407,20 +407,23 @@ struct folio { unsigned long _flags_2; unsigned long _head_2; /* public: */ - void *_hugetlb_subpool; - void *_hugetlb_cgroup; - void *_hugetlb_cgroup_rsvd; - void *_hugetlb_hwpoison; + struct list_head _deferred_list; /* private: the union with struct page is transitional */ }; + struct page __page_2; + }; + union { struct { - unsigned long _flags_2a; - unsigned long _head_2a; + unsigned long _flags_3; + unsigned long _head_3; /* public: */ - struct list_head _deferred_list; + void *_hugetlb_subpool; + void *_hugetlb_cgroup; + void *_hugetlb_cgroup_rsvd; + void *_hugetlb_hwpoison; /* private: the union with struct page is transitional */ }; - struct page __page_2; + struct page __page_3; }; }; @@ -457,8 +460,12 @@ FOLIO_MATCH(_refcount, _refcount_1); offsetof(struct page, pg) + 2 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); -FOLIO_MATCH(flags, _flags_2a); -FOLIO_MATCH(compound_head, _head_2a); +#undef FOLIO_MATCH +#define FOLIO_MATCH(pg, fl) \ + static_assert(offsetof(struct folio, fl) == \ + offsetof(struct page, pg) + 3 * sizeof(struct page)) +FOLIO_MATCH(flags, _flags_3); +FOLIO_MATCH(compound_head, _head_3); #undef FOLIO_MATCH /** -- cgit v1.2.3 From 31a31da8a6187f1e5448ec73222e01d7d3fed4aa Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:58 +0100 Subject: mm: move _pincount in folio to page[2] on 32bit Let's free up some space on 32bit in page[1] by moving the _pincount to page[2]. For order-1 folios (never anon folios!) on 32bit, we will now also use the GUP_PIN_COUNTING_BIAS approach. A fully-mapped order-1 folio requires 2 references. With GUP_PIN_COUNTING_BIAS being 1024, we'd detect such folios as "maybe pinned" with 512 full mappings, instead of 1024 for order-0. As anon folios are out of the picture (which are the most relevant users of checking for pinnings on *mapped* pages) and we are talking about 32bit, this is not expected to cause any trouble. In __dump_page(), copy one additional folio page if we detect a folio with an order > 1, so we can dump the pincount on order > 1 folios reliably. Note that THPs on 32bit are not particularly common (and we don't care too much about performance), but we want to keep it working reliably, because likely we want to use large folios there as well in the future, independent of PMD leaf support. Once we dynamically allocate "struct folio", fortunately the 32bit specifics will likely go away again; even small folios could then have a pincount and folio_has_pincount() would essentially always return "true". Link: https://lkml.kernel.org/r/20250303163014.1128035-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. 
Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 +++++++++-- include/linux/mm_types.h | 5 +++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c9c2ca345350..860082ba8978 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2010,6 +2010,13 @@ static inline struct folio *pfn_folio(unsigned long pfn) return page_folio(pfn_to_page(pfn)); } +static inline bool folio_has_pincount(const struct folio *folio) +{ + if (IS_ENABLED(CONFIG_64BIT)) + return folio_test_large(folio); + return folio_order(folio) > 1; +} + /** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. @@ -2026,7 +2033,7 @@ static inline struct folio *pfn_folio(unsigned long pfn) * get that many refcounts, and b) all the callers of this routine are * expected to be able to deal gracefully with a false positive. * - * For large folios, the result will be exactly correct. That's because + * For most large folios, the result will be exactly correct. That's because * we have more tracking data available: the _pincount field is used * instead of the GUP_PIN_COUNTING_BIAS scheme. * @@ -2037,7 +2044,7 @@ static inline struct folio *pfn_folio(unsigned long pfn) */ static inline bool folio_maybe_dma_pinned(struct folio *folio) { - if (folio_test_large(folio)) + if (folio_has_pincount(folio)) return atomic_read(&folio->_pincount) > 0; /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 727322ecbfdd..3ea2019a1aac 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -387,7 +387,9 @@ struct folio { atomic_t _large_mapcount; atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; +#ifdef CONFIG_64BIT atomic_t _pincount; +#endif /* CONFIG_64BIT */ /* private: the union with struct page is transitional */ }; unsigned long _usable_1[4]; @@ -408,6 +410,9 @@ struct folio { unsigned long _head_2; /* public: */ struct list_head _deferred_list; +#ifndef CONFIG_64BIT + atomic_t _pincount; +#endif /* !CONFIG_64BIT */ /* private: the union with struct page is transitional */ }; struct page __page_2; -- cgit v1.2.3
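The false-positive trade-off quantified in the patch above is easy to sanity-check; a toy sketch, assuming the kernel's GUP_PIN_COUNTING_BIAS of 1024 and the stated two references per full order-1 mapping (maybe_dma_pinned() here is a simplified stand-in, not the kernel helper):

#include <stdbool.h>
#include <stdio.h>

#define GUP_PIN_COUNTING_BIAS 1024	/* the kernel's bias value */

/* Without a dedicated _pincount, a folio whose refcount reaches the
 * bias is reported as "maybe pinned" -- possibly a false positive. */
static bool maybe_dma_pinned(unsigned int refcount)
{
	return refcount >= GUP_PIN_COUNTING_BIAS;
}

int main(void)
{
	/* A fully mapped order-1 folio takes 2 references per mapping, */
	unsigned int refs_per_mapping = 2;
	/* so 512 full mappings alone already cross the threshold. */
	unsigned int mappings = GUP_PIN_COUNTING_BIAS / refs_per_mapping;

	printf("false positive after %u mappings: %d\n", mappings,
	       maybe_dma_pinned(mappings * refs_per_mapping));	/* 512, 1 */
	return 0;
}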
From 845d2be6d41f016da670dcc4c8f5357c22172be8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:29:59 +0100 Subject: mm: move _entire_mapcount in folio to page[2] on 32bit Let's free up some space on 32bit in page[1] by moving the _entire_mapcount to page[2]. Ordinary folios only use the entire mapcount with PMD mappings, so order-1 folios don't apply. Similarly, hugetlb folios are always larger than order-1, turning the entire mapcount essentially unused for all order-1 folios. Moving it to page[2] will thus not change anything for order-1 folios. On 32bit, simply check in folio_entire_mapcount() whether we have an order-1 folio, and return 0 in that case. Note that THPs on 32bit are not particularly common (and we don't care too much about performance), but we want to keep it working reliably, because likely we want to use large folios there as well in the future, independent of PMD leaf support. Once we dynamically allocate "struct folio", the 32bit specifics will go away again; even small folios could then have an entire mapcount. Link: https://lkml.kernel.org/r/20250303163014.1128035-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ include/linux/mm_types.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 860082ba8978..f366c180f2b6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1333,6 +1333,8 @@ static inline int is_vmalloc_or_module_addr(const void *x) static inline int folio_entire_mapcount(const struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1)) + return 0; return atomic_read(&folio->_entire_mapcount) + 1; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3ea2019a1aac..9499eb8e8e66 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -385,9 +385,9 @@ struct folio { struct { /* public: */ atomic_t _large_mapcount; - atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; #ifdef CONFIG_64BIT + atomic_t _entire_mapcount; atomic_t _pincount; #endif /* CONFIG_64BIT */ /* private: the union with struct page is transitional */ @@ -411,6 +411,7 @@ struct folio { /* public: */ struct list_head _deferred_list; #ifndef CONFIG_64BIT + atomic_t _entire_mapcount; atomic_t _pincount; #endif /* !CONFIG_64BIT */ /* private: the union with struct page is transitional */ -- cgit v1.2.3 From 405c4ef769c7c5e83e3556b48a190fe1afe82425 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:00 +0100 Subject: mm/rmap: pass dst_vma to folio_dup_file_rmap_pte() and friends We'll need access to the destination MM when modifying the large mapcount of non-hugetlb large folios next. So pass in the destination VMA. Link: https://lkml.kernel.org/r/20250303163014.1128035-8-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A.
Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/rmap.h | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6abf7960077a..e795610bade8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -335,7 +335,8 @@ static inline void hugetlb_remove_rmap(struct folio *folio) } static __always_inline void __folio_dup_file_rmap(struct folio *folio, - struct page *page, int nr_pages, enum rmap_level level) + struct page *page, int nr_pages, struct vm_area_struct *dst_vma, + enum rmap_level level) { const int orig_nr_pages = nr_pages; @@ -366,45 +367,47 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, * @folio: The folio to duplicate the mappings of * @page: The first page to duplicate the mappings of * @nr_pages: The number of pages of which the mapping will be duplicated + * @dst_vma: The destination vm area * * The page range of the folio is defined by [page, page + nr_pages) * * The caller needs to hold the page table lock. */ static inline void folio_dup_file_rmap_ptes(struct folio *folio, - struct page *page, int nr_pages) + struct page *page, int nr_pages, struct vm_area_struct *dst_vma) { - __folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, RMAP_LEVEL_PTE); } static __always_inline void folio_dup_file_rmap_pte(struct folio *folio, - struct page *page) + struct page *page, struct vm_area_struct *dst_vma) { - __folio_dup_file_rmap(folio, page, 1, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, 1, dst_vma, RMAP_LEVEL_PTE); } /** * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio * @folio: The folio to duplicate the mapping of * @page: The first page to duplicate the mapping of + * @dst_vma: The destination vm area * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock. */ static inline void folio_dup_file_rmap_pmd(struct folio *folio, - struct page *page) + struct page *page, struct vm_area_struct *dst_vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, RMAP_LEVEL_PTE); #else WARN_ON_ONCE(true); #endif } static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, - struct page *page, int nr_pages, struct vm_area_struct *src_vma, - enum rmap_level level) + struct page *page, int nr_pages, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, enum rmap_level level) { const int orig_nr_pages = nr_pages; bool maybe_pinned; @@ -470,6 +473,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, * @folio: The folio to duplicate the mappings of * @page: The first page to duplicate the mappings of * @nr_pages: The number of pages of which the mapping will be duplicated + * @dst_vma: The destination vm area * @src_vma: The vm area from which the mappings are duplicated * * The page range of the folio is defined by [page, page + nr_pages) @@ -488,16 +492,18 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise. 
*/ static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio, - struct page *page, int nr_pages, struct vm_area_struct *src_vma) + struct page *page, int nr_pages, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { - return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma, - RMAP_LEVEL_PTE); + return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma, + src_vma, RMAP_LEVEL_PTE); } static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, - struct page *page, struct vm_area_struct *src_vma) + struct page *page, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { - return __folio_try_dup_anon_rmap(folio, page, 1, src_vma, + return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma, RMAP_LEVEL_PTE); } @@ -506,6 +512,7 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, * of a folio * @folio: The folio to duplicate the mapping of * @page: The first page to duplicate the mapping of + * @dst_vma: The destination vm area * @src_vma: The vm area from which the mapping is duplicated * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) @@ -524,11 +531,12 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise. */ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, - struct page *page, struct vm_area_struct *src_vma) + struct page *page, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, src_vma, - RMAP_LEVEL_PMD); + return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma, + src_vma, RMAP_LEVEL_PMD); #else WARN_ON_ONCE(true); return -EBUSY; -- cgit v1.2.3 From 932961c4b666937e27f7ec5358fb7aabf0f17d41 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:02 +0100 Subject: mm/rmap: abstract large mapcount operations for large folios (!hugetlb) Let's abstract the operations so we can extend these operations easily. Link: https://lkml.kernel.org/r/20250303163014.1128035-10-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/rmap.h | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index e795610bade8..d1e888cc97a5 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -173,6 +173,30 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, struct anon_vma *folio_get_anon_vma(const struct folio *folio); +static inline void folio_set_large_mapcount(struct folio *folio, int mapcount, + struct vm_area_struct *vma) +{ + /* Note: mapcounts start at -1. 
*/ + atomic_set(&folio->_large_mapcount, mapcount - 1); +} + +static inline void folio_add_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + atomic_add(diff, &folio->_large_mapcount); +} + +static inline void folio_sub_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + atomic_sub(diff, &folio->_large_mapcount); +} + +#define folio_inc_large_mapcount(folio, vma) \ + folio_add_large_mapcount(folio, 1, vma) +#define folio_dec_large_mapcount(folio, vma) \ + folio_sub_large_mapcount(folio, 1, vma) + /* RMAP flags, currently only relevant for some anon rmap operations. */ typedef int __bitwise rmap_t; @@ -352,12 +376,12 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, do { atomic_inc(&page->_mapcount); } while (page++, --nr_pages > 0); - atomic_add(orig_nr_pages, &folio->_large_mapcount); + folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; case RMAP_LEVEL_PMD: case RMAP_LEVEL_PUD: atomic_inc(&folio->_entire_mapcount); - atomic_inc(&folio->_large_mapcount); + folio_inc_large_mapcount(folio, dst_vma); break; } } @@ -451,7 +475,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, ClearPageAnonExclusive(page); atomic_inc(&page->_mapcount); } while (page++, --nr_pages > 0); - atomic_add(orig_nr_pages, &folio->_large_mapcount); + folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; case RMAP_LEVEL_PMD: case RMAP_LEVEL_PUD: @@ -461,7 +485,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, ClearPageAnonExclusive(page); } atomic_inc(&folio->_entire_mapcount); - atomic_inc(&folio->_large_mapcount); + folio_inc_large_mapcount(folio, dst_vma); break; } return 0; -- cgit v1.2.3 From b85aa9b11b67ed4b086bf5366392adec1b4a6a81 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:03 +0100 Subject: bit_spinlock: __always_inline (un)lock functions The compiler might decide that it is a smart idea to not inline bit_spin_lock(), primarily when a couple of functions in the same file end up calling it. Especially when used in RMAP map/unmap code next, the compiler sometimes decides to not inline, which is then observable in some micro-benchmarks. Let's simply flag all lock/unlock functions as __always_inline; arch_test_and_set_bit_lock() and friends are already tagged like that (but not test_and_set_bit_lock() for some reason). If ever a problem, we could split it into a fast and a slow path, and only force the fast path to be inlined. But there is nothing particularly "big" here. Link: https://lkml.kernel.org/r/20250303163014.1128035-11-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/bit_spinlock.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h index bbc4730a6505..c0989b5b0407 100644 --- a/include/linux/bit_spinlock.h +++ b/include/linux/bit_spinlock.h @@ -13,7 +13,7 @@ * Don't use this unless you really need to: spin_lock() and spin_unlock() * are significantly faster. 
*/ -static inline void bit_spin_lock(int bitnum, unsigned long *addr) +static __always_inline void bit_spin_lock(int bitnum, unsigned long *addr) { /* * Assuming the lock is uncontended, this never enters @@ -38,7 +38,7 @@ static inline void bit_spin_lock(int bitnum, unsigned long *addr) /* * Return true if it was acquired */ -static inline int bit_spin_trylock(int bitnum, unsigned long *addr) +static __always_inline int bit_spin_trylock(int bitnum, unsigned long *addr) { preempt_disable(); #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) @@ -54,7 +54,7 @@ static inline int bit_spin_trylock(int bitnum, unsigned long *addr) /* * bit-based spin_unlock() */ -static inline void bit_spin_unlock(int bitnum, unsigned long *addr) +static __always_inline void bit_spin_unlock(int bitnum, unsigned long *addr) { #ifdef CONFIG_DEBUG_SPINLOCK BUG_ON(!test_bit(bitnum, addr)); @@ -71,7 +71,7 @@ static inline void bit_spin_unlock(int bitnum, unsigned long *addr) * non-atomic version, which can be used eg. if the bit lock itself is * protecting the rest of the flags in the word. */ -static inline void __bit_spin_unlock(int bitnum, unsigned long *addr) +static __always_inline void __bit_spin_unlock(int bitnum, unsigned long *addr) { #ifdef CONFIG_DEBUG_SPINLOCK BUG_ON(!test_bit(bitnum, addr)); -- cgit v1.2.3 From 6af8cb80d3a9a6bbd521d8a7c949b4eafb7dba5d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:05 +0100 Subject: mm/rmap: basic MM owner tracking for large folios (!hugetlb) For small folios, we traditionally use the mapcount to decide whether it was "certainly mapped exclusively" by a single MM (mapcount == 1) or whether it "maybe mapped shared" by multiple MMs (mapcount > 1). For PMD-sized folios that were PMD-mapped, we were able to use a similar mechanism (single PMD mapping), but for PTE-mapped folios and in the future folios that span multiple PMDs, this does not work. So we need a different mechanism to handle large folios. Let's add a new mechanism to detect whether a large folio is "certainly mapped exclusively", or whether it is "maybe mapped shared". We'll use this information next to optimize CoW reuse for PTE-mapped anonymous THP, and to convert folio_likely_mapped_shared() to folio_maybe_mapped_shared(), independent of per-page mapcounts. For each large folio, we'll have two slots, whereby a slot stores: (1) an MM id: unique id assigned to each MM (2) a per-MM mapcount If a slot is unoccupied, it can be taken by the next MM that maps folio page. In addition, we'll remember the current state -- "mapped exclusively" vs. "maybe mapped shared" -- and use a bit spinlock to sync on updates and to reduce the total number of atomic accesses on updates. In the future, it might be possible to squeeze a proper spinlock into "struct folio". For now, keep it simple, as we require the whole thing with THP only, that is incompatible with RT. As we have to squeeze this information into the "struct folio" of even folios of order-1 (2 pages), and we generally want to reduce the required metadata, we'll assign each MM a unique ID that can fit into an int. In total, we can squeeze everything into 4x int (2x long) on 64bit. 32bit support is a bit challenging, because we only have 2x long == 2x int in order-1 folios. But we can make it work for now, because we neither expect many MMs nor very large folios on 32bit. We will reliably detect folios as "mapped exclusively" vs. 
"mapped shared" as long as only two MMs map pages of a folio at one point in time -- for example with fork() and short-lived child processes, or with apps that hand over state from one instance to another. As soon as three MMs are involved at the same time, we might detect "maybe mapped shared" although the folio is "mapped exclusively". Example 1: (1) App1 faults in a (shmem/file-backed) folio page -> Tracked as MM0 (2) App2 faults in a folio page -> Tracked as MM1 (4) App1 unmaps all folio pages -> We will detect "mapped exclusively". Example 2: (1) App1 faults in a (shmem/file-backed) folio page -> Tracked as MM0 (2) App2 faults in a folio page -> Tracked as MM1 (3) App3 faults in a folio page -> No slot available, tracked as "unknown" (4) App1 and App2 unmap all folio pages -> We will detect "maybe mapped shared". Make use of __always_inline to keep possible performance degradation when (un)mapping large folios to a minimum. Note: by squeezing the two flags into the "unsigned long" that stores the MM ids, we can use non-atomic __bit_spin_unlock() and non-atomic setting/clearing of the "maybe mapped shared" bit, effectively not adding any new atomics on the hot path when updating the large mapcount + new metadata, which further helps reduce the runtime overhead in micro-benchmarks. Link: https://lkml.kernel.org/r/20250303163014.1128035-13-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 49 ++++++++++++++ include/linux/page-flags.h | 4 ++ include/linux/rmap.h | 165 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 218 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9499eb8e8e66..aac7c87b04e1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -292,6 +292,44 @@ typedef struct { #define NR_PAGES_IN_LARGE_FOLIO #endif +/* + * On 32bit, we can cut the required metadata in half, because: + * (a) PID_MAX_LIMIT implicitly limits the number of MMs we could ever have, + * so we can limit MM IDs to 15 bit (32767). + * (b) We don't expect folios where even a single complete PTE mapping by + * one MM would exceed 15 bits (order-15). + */ +#ifdef CONFIG_64BIT +typedef int mm_id_mapcount_t; +#define MM_ID_MAPCOUNT_MAX INT_MAX +typedef unsigned int mm_id_t; +#else /* !CONFIG_64BIT */ +typedef short mm_id_mapcount_t; +#define MM_ID_MAPCOUNT_MAX SHRT_MAX +typedef unsigned short mm_id_t; +#endif /* CONFIG_64BIT */ + +/* We implicitly use the dummy ID for init-mm etc. where we never rmap pages. */ +#define MM_ID_DUMMY 0 +#define MM_ID_MIN (MM_ID_DUMMY + 1) + +/* + * We leave the highest bit of each MM id unused, so we can store a flag + * in the highest bit of each folio->_mm_id[]. + */ +#define MM_ID_BITS ((sizeof(mm_id_t) * BITS_PER_BYTE) - 1) +#define MM_ID_MASK ((1U << MM_ID_BITS) - 1) +#define MM_ID_MAX MM_ID_MASK + +/* + * In order to use bit_spin_lock(), which requires an unsigned long, we + * operate on folio->_mm_ids when working on flags. 
+ */ +#define FOLIO_MM_IDS_LOCK_BITNUM MM_ID_BITS +#define FOLIO_MM_IDS_LOCK_BIT BIT(FOLIO_MM_IDS_LOCK_BITNUM) +#define FOLIO_MM_IDS_SHARED_BITNUM (2 * MM_ID_BITS + 1) +#define FOLIO_MM_IDS_SHARED_BIT BIT(FOLIO_MM_IDS_SHARED_BITNUM) + /** * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. @@ -318,6 +356,9 @@ typedef struct { * @_nr_pages_mapped: Do not use outside of rmap and debug code. * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_nr_pages: Do not use directly, call folio_nr_pages(). + * @_mm_id: Do not use outside of rmap code. + * @_mm_ids: Do not use outside of rmap code. + * @_mm_id_mapcount: Do not use outside of rmap code. * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. @@ -390,6 +431,11 @@ struct folio { atomic_t _entire_mapcount; atomic_t _pincount; #endif /* CONFIG_64BIT */ + mm_id_mapcount_t _mm_id_mapcount[2]; + union { + mm_id_t _mm_id[2]; + unsigned long _mm_ids; + }; /* private: the union with struct page is transitional */ }; unsigned long _usable_1[4]; @@ -1114,6 +1160,9 @@ struct mm_struct { #endif } lru_gen; #endif /* CONFIG_LRU_GEN_WALKS_MMU */ +#ifdef CONFIG_MM_ID + mm_id_t mm_id; +#endif /* CONFIG_MM_ID */ } __randomize_layout; /* diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index cab382bd965e..f26ce54c7aa4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1185,6 +1185,10 @@ static inline int folio_has_private(const struct folio *folio) return !!(folio->flags & PAGE_FLAGS_PRIVATE); } +static inline bool folio_test_large_maybe_mapped_shared(const struct folio *folio) +{ + return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids); +} #undef PF_ANY #undef PF_HEAD #undef PF_NO_TAIL diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d1e888cc97a5..c131b0efff0f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -13,6 +13,7 @@ #include #include #include +#include /* * The anon_vma heads a list of private "related" vmas, to scan if @@ -173,6 +174,169 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, struct anon_vma *folio_get_anon_vma(const struct folio *folio); +#ifdef CONFIG_MM_ID +static __always_inline void folio_lock_large_mapcount(struct folio *folio) +{ + bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids); +} + +static __always_inline void folio_unlock_large_mapcount(struct folio *folio) +{ + __bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids); +} + +static inline unsigned int folio_mm_id(const struct folio *folio, int idx) +{ + VM_WARN_ON_ONCE(idx != 0 && idx != 1); + return folio->_mm_id[idx] & MM_ID_MASK; +} + +static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id) +{ + VM_WARN_ON_ONCE(idx != 0 && idx != 1); + folio->_mm_id[idx] &= ~MM_ID_MASK; + folio->_mm_id[idx] |= id; +} + +static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio, + int diff, mm_id_t mm_id) +{ + VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio)); + VM_WARN_ON_ONCE(diff <= 0); + VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX); + + /* + * Make sure we can detect at least one complete PTE mapping of the + * folio in a single MM as "exclusively mapped". This is primarily + * a check on 32bit, where we currently reduce the size of the per-MM + * mapcount to a short. 
+ */ + VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio)); + VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX); + + VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY && + folio->_mm_id_mapcount[0] != -1); + VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY && + folio->_mm_id_mapcount[0] < 0); + VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY && + folio->_mm_id_mapcount[1] != -1); + VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY && + folio->_mm_id_mapcount[1] < 0); + VM_WARN_ON_ONCE(!folio_mapped(folio) && + folio_test_large_maybe_mapped_shared(folio)); +} + +static __always_inline void folio_set_large_mapcount(struct folio *folio, + int mapcount, struct vm_area_struct *vma) +{ + __folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id); + + VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY); + VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY); + + /* Note: mapcounts start at -1. */ + atomic_set(&folio->_large_mapcount, mapcount - 1); + folio->_mm_id_mapcount[0] = mapcount - 1; + folio_set_mm_id(folio, 0, vma->vm_mm->mm_id); +} + +static __always_inline void folio_add_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + const mm_id_t mm_id = vma->vm_mm->mm_id; + int new_mapcount_val; + + folio_lock_large_mapcount(folio); + __folio_large_mapcount_sanity_checks(folio, diff, mm_id); + + new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff; + atomic_set(&folio->_large_mapcount, new_mapcount_val); + + /* + * If a folio is mapped more than once into an MM on 32bit, we + * can in theory overflow the per-MM mapcount (although only for + * fairly large folios), turning it negative. In that case, just + * free up the slot and mark the folio "mapped shared", otherwise + * we might be in trouble when unmapping pages later. + */ + if (folio_mm_id(folio, 0) == mm_id) { + folio->_mm_id_mapcount[0] += diff; + if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) { + folio->_mm_id_mapcount[0] = -1; + folio_set_mm_id(folio, 0, MM_ID_DUMMY); + folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; + } + } else if (folio_mm_id(folio, 1) == mm_id) { + folio->_mm_id_mapcount[1] += diff; + if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) { + folio->_mm_id_mapcount[1] = -1; + folio_set_mm_id(folio, 1, MM_ID_DUMMY); + folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; + } + } else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) { + folio_set_mm_id(folio, 0, mm_id); + folio->_mm_id_mapcount[0] = diff - 1; + /* We might have other mappings already. */ + if (new_mapcount_val != diff - 1) + folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; + } else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) { + folio_set_mm_id(folio, 1, mm_id); + folio->_mm_id_mapcount[1] = diff - 1; + /* Slot 0 certainly has mappings as well. 
*/ + folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; + } + folio_unlock_large_mapcount(folio); +} + +static __always_inline void folio_sub_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + const mm_id_t mm_id = vma->vm_mm->mm_id; + int new_mapcount_val; + + folio_lock_large_mapcount(folio); + __folio_large_mapcount_sanity_checks(folio, diff, mm_id); + + new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff; + atomic_set(&folio->_large_mapcount, new_mapcount_val); + + /* + * There are valid corner cases where we might underflow a per-MM + * mapcount (some mappings added when no slot was free, some mappings + * added once a slot was free), so we always set it to -1 once we go + * negative. + */ + if (folio_mm_id(folio, 0) == mm_id) { + folio->_mm_id_mapcount[0] -= diff; + if (folio->_mm_id_mapcount[0] >= 0) + goto out; + folio->_mm_id_mapcount[0] = -1; + folio_set_mm_id(folio, 0, MM_ID_DUMMY); + } else if (folio_mm_id(folio, 1) == mm_id) { + folio->_mm_id_mapcount[1] -= diff; + if (folio->_mm_id_mapcount[1] >= 0) + goto out; + folio->_mm_id_mapcount[1] = -1; + folio_set_mm_id(folio, 1, MM_ID_DUMMY); + } + + /* + * If one MM slot owns all mappings, the folio is mapped exclusively. + * Note that if the folio is now unmapped (new_mapcount_val == -1), both + * slots must be free (mapcount == -1), and we'll also mark it as + * exclusive. + */ + if (folio->_mm_id_mapcount[0] == new_mapcount_val || + folio->_mm_id_mapcount[1] == new_mapcount_val) + folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT; +out: + folio_unlock_large_mapcount(folio); +} +#else /* !CONFIG_MM_ID */ +/* + * See __folio_rmap_sanity_checks(), we might map large folios even without + * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now. + */ static inline void folio_set_large_mapcount(struct folio *folio, int mapcount, struct vm_area_struct *vma) { @@ -191,6 +355,7 @@ static inline void folio_sub_large_mapcount(struct folio *folio, { atomic_sub(diff, &folio->_large_mapcount); } +#endif /* CONFIG_MM_ID */ #define folio_inc_large_mapcount(folio, vma) \ folio_add_large_mapcount(folio, 1, vma) -- cgit v1.2.3 From 003fde4492c88ac3a1fee3d97b3834a679780af3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:07 +0100 Subject: mm: convert folio_likely_mapped_shared() to folio_maybe_mapped_shared() Let's reuse our new MM ownership tracking infrastructure for large folios to make folio_likely_mapped_shared() never return false negatives -- never indicating "not mapped shared" although the folio *is* mapped shared. With that, we can rename it to folio_maybe_mapped_shared() and get rid of the dependency on the mapcount of the first folio page. The semantics are now arguably clearer: no mixture of "false negatives" and "false positives", only the remaining possibility for "false positives". Thoroughly document the new semantics. We might now detect that a large folio is "maybe mapped shared" although it *no longer* is -- but once was. Now, if more than two MMs mapped a folio at the same time, and the MM mapping the folio exclusively at the end is not one tracked in the two folio MM slots, we will detect the folio as "maybe mapped shared". For anonymous folios, usually (except weird corner cases) all PTEs that target a "maybe mapped shared" folio are R/O. As soon as a child process would write to them (iow, actively use them), we would CoW and effectively replace these PTEs. Most cases (below) are not expected to really matter with large anonymous folios for this reason. 
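For illustration, a concrete timeline of the remaining false positive described above: MM A maps the folio (MM-ID slot 0 tracks A), MM B maps it (slot 1 tracks B), MM C maps it (no slot is free, so the folio is flagged "maybe mapped shared"), and then A and B unmap. C now maps the folio exclusively, yet it owns no slot, so folio_maybe_mapped_shared() keeps returning true until the tracking state is reset, e.g. during migration or folio split.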
Most importantly, there will be no change at all for: * small folios * hugetlb folios * PMD-mapped PMD-sized THPs (single mapping) This change has the potential to affect existing callers of folio_likely_mapped_shared() -> folio_maybe_mapped_shared(): (1) fs/proc/task_mmu.c: no change (hugetlb) (2) khugepaged counts PTEs that target shared folios towards max_ptes_shared (default: HPAGE_PMD_NR / 2), meaning we could skip a collapse where we would have previously collapsed. This only applies to anonymous folios and is not expected to matter in practice. Worth noting that this change sorts out case (A) documented in commit 1bafe96e89f0 ("mm/khugepaged: replace page_mapcount() check by folio_likely_mapped_shared()") by removing the possibility for "false negatives". (3) MADV_COLD / MADV_PAGEOUT / MADV_FREE will not try splitting PTE-mapped THPs that are considered shared but not fully covered by the requested range, consequently not processing them. PMD-mapped PMD-sized THPs are not affected, or when all PTEs are covered. These functions are usually only called on anon/file folios that are exclusively mapped most of the time (no other file mappings or no fork()), so the "false negatives" are not expected to matter in practice. (4) mbind() / migrate_pages() / move_pages() will refuse to migrate shared folios unless MPOL_MF_MOVE_ALL is effective (requires CAP_SYS_NICE). We will now reject some folios that could be migrated. Similar to (3), especially with MPOL_MF_MOVE_ALL, so this is not expected to matter in practice. Note that cpuset_migrate_mm_workfn() calls do_migrate_pages() with MPOL_MF_MOVE_ALL. (5) NUMA hinting mm/migrate.c:migrate_misplaced_folio_prepare() will skip file folios that are probably shared libraries (-> "mapped shared" and executable). This check would have detected it as a shared library at some point (at least 3 MMs mapping it), so detecting it afterwards does not sound wrong (still a shared library). Not expected to matter. mm/memory.c:numa_migrate_check() will indicate TNF_SHARED in MAP_SHARED file mappings when encountering a shared folio. Similar reasoning, not expected to matter. mm/mprotect.c:change_pte_range() will skip folios detected as shared in CoW mappings. Similarly, this is not expected to matter in practice, but if it would ever be a problem we could relax that check a bit (e.g., basing it on the average page-mapcount in a folio), because it was only an optimization when many (e.g., 288) processes were mapping the same folios -- see commit 859d4adc3415 ("mm: numa: do not trap faults on shared data section pages.") (6) mm/rmap.c:folio_referenced_one() will skip exclusive swapbacked folios in dying processes. Applies to anonymous folios only. Without "false negatives", we'll now skip all actually shared ones. Skipping ones that are actually exclusive won't really matter, it's a pure optimization, and is not expected to matter in practice. In theory, one can detect the problematic scenario: folio_mapcount() > 0 and no folio MM slot is occupied ("state unknown"). One could reset the MM slots while doing an rmap walk, which migration / folio split already do when setting everything up. Further, when batching PTEs we might naturally learn about an owner (e.g., folio_mapcount() == nr_ptes) and could update the owner. However, we'll defer that until the scenarios where it would really matter are clear.
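Such a detection could look roughly like the following sketch (a hypothetical helper built on the accessors introduced earlier in this series, not code from this patch): """ /* * Illustrative only: "mapped, but no MM-ID slot tracks an owner" is * the "state unknown" scenario described above. */ static bool folio_mm_state_unknown(const struct folio *folio) { return folio_mapcount(folio) > 0 && folio_mm_id(folio, 0) == MM_ID_DUMMY && folio_mm_id(folio, 1) == MM_ID_DUMMY; } """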
Link: https://lkml.kernel.org/r/20250303163014.1128035-15-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/mm.h | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f366c180f2b6..82776b409391 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2251,23 +2251,18 @@ static inline size_t folio_size(const struct folio *folio) } /** - * folio_likely_mapped_shared - Estimate if the folio is mapped into the page - * tables of more than one MM + * folio_maybe_mapped_shared - Whether the folio is mapped into the page + * tables of more than one MM * @folio: The folio. * - * This function checks if the folio is currently mapped into more than one - * MM ("mapped shared"), or if the folio is only mapped into a single MM - * ("mapped exclusively"). + * This function checks if the folio maybe currently mapped into more than one + * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single + * MM ("mapped exclusively"). * * For KSM folios, this function also returns "mapped shared" when a folio is * mapped multiple times into the same MM, because the individual page mappings * are independent. * - * As precise information is not easily available for all folios, this function - * estimates the number of MMs ("sharers") that are currently mapping a folio - * using the number of times the first page of the folio is currently mapped - * into page tables. - * * For small anonymous folios and anonymous hugetlb folios, the return * value will be exactly correct: non-KSM folios can only be mapped at most once * into an MM, and they cannot be partially mapped. KSM folios are @@ -2275,8 +2270,8 @@ static inline size_t folio_size(const struct folio *folio) * * For other folios, the result can be fuzzy: * #. For partially-mappable large folios (THP), the return value can wrongly - * indicate "mapped exclusively" (false negative) when the folio is - * only partially mapped into at least one MM. + * indicate "mapped shared" (false positive) if a folio was mapped by + * more than two MMs at one point in time. * #. For pagecache folios (including hugetlb), the return value can wrongly * indicate "mapped shared" (false positive) when two VMAs in the same MM * cover the same file range. @@ -2293,7 +2288,7 @@ static inline size_t folio_size(const struct folio *folio) * * Return: Whether the folio is estimated to be mapped into more than one MM. */ -static inline bool folio_likely_mapped_shared(struct folio *folio) +static inline bool folio_maybe_mapped_shared(struct folio *folio) { int mapcount = folio_mapcount(folio); @@ -2301,16 +2296,22 @@ static inline bool folio_likely_mapped_shared(struct folio *folio) if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio))) return mapcount > 1; - /* A single mapping implies "mapped exclusively". */ - if (mapcount <= 1) - return false; - - /* If any page is mapped more than once we treat it "mapped shared". 
*/ - if (folio_entire_mapcount(folio) || mapcount > folio_nr_pages(folio)) + /* + * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ... + * simply assume "mapped shared", nobody should really care + * about this for arbitrary kernel allocations. + */ + if (!IS_ENABLED(CONFIG_MM_ID)) return true; - /* Let's guess based on the first subpage. */ - return atomic_read(&folio->_mapcount) > 0; + /* + * A single mapping implies "mapped exclusively", even if the + * folio flag says something different: it's easier to handle this + * case here instead of on the RMAP hot path. + */ + if (mapcount <= 1) + return false; + return folio_test_large_maybe_mapped_shared(folio); } #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE -- cgit v1.2.3 From 749492229e3bd6222dda7267b8244135229d1fd8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Mar 2025 17:30:13 +0100 Subject: mm: stop maintaining the per-page mapcount of large folios (CONFIG_NO_PAGE_MAPCOUNT) Everything is in place to stop using the per-page mapcounts in large folios: the mapcount of tail pages will always be logically 0 (-1 value), just like it currently is for hugetlb folios already, and the page mapcount of the head page is either 0 (-1 value) or contains a page type (e.g., hugetlb). Maintaining _nr_pages_mapped without per-page mapcounts is impossible, so that one also has to go with CONFIG_NO_PAGE_MAPCOUNT. There are two remaining implications: (1) Per-node, per-cgroup and per-lruvec stats of "NR_ANON_MAPPED" ("mapped anonymous memory") and "NR_FILE_MAPPED" ("mapped file memory"): As soon as any page of the folio is mapped -- folio_mapped() -- we now account the complete folio as mapped. Once the last page is unmapped -- !folio_mapped() -- we account the complete folio as unmapped. This implies that ... * "AnonPages" and "Mapped" in /proc/meminfo and /sys/devices/system/node/*/meminfo * cgroup v2: "anon" and "file_mapped" in "memory.stat" and "memory.numa_stat" * cgroup v1: "rss" and "mapped_file" in "memory.stat" and "memory.numa_stat" ... can now appear higher than before. But note that these folios do consume that memory, simply not all pages are actually currently mapped. It's worth noting that other accounting in the kernel (esp. cgroup charging on allocation) is not affected by this change. [why oh why is "anon" called "rss" in cgroup v1] (2) Detecting partial mappings Detecting whether anon THPs are partially mapped gets a bit more unreliable. As long as a single MM maps such a large folio ("exclusively mapped"), we can reliably detect it. Especially before fork() / after a short-lived child process quit, we will detect partial mappings reliably, which is the common case. In essence, if the average per-page mapcount in an anon THP is < 1, we know for sure that we have a partial mapping. However, as soon as multiple MMs are involved, we might miss detecting partial mappings: this might be relevant with long-lived child processes. If we have a fully-mapped anon folio before fork(), once our child processes and our parent all unmap (zap/COW) the same pages (but not the complete folio), we might not detect the partial mapping. However, once the child processes quit we would detect the partial mapping. How relevant this case is in practice remains to be seen. Swapout/migration will likely mitigate this. In the future, RMAP walkers could check for that case (e.g., when collecting access bits during reclaim) and simply flag them for deferred-splitting.
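The "average per-page mapcount < 1" rule above reduces to a trivial predicate (hypothetical helper name; the result is only exact while a single MM maps the folio exclusively): """ /* * Illustrative only: fewer mappings than pages means some pages of the * folio are certainly unmapped; the converse does not hold once * multiple MMs are involved. */ static bool folio_certainly_partially_mapped(struct folio *folio) { return folio_mapped(folio) && folio_mapcount(folio) < folio_nr_pages(folio); } """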
Link: https://lkml.kernel.org/r/20250303163014.1128035-21-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcow (Oracle) Cc: Michal Koutn Cc: Muchun Song Cc: tejun heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/rmap.h | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c131b0efff0f..6b82b618846e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -240,7 +240,7 @@ static __always_inline void folio_set_large_mapcount(struct folio *folio, folio_set_mm_id(folio, 0, vma->vm_mm->mm_id); } -static __always_inline void folio_add_large_mapcount(struct folio *folio, +static __always_inline int folio_add_return_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { const mm_id_t mm_id = vma->vm_mm->mm_id; @@ -286,9 +286,11 @@ static __always_inline void folio_add_large_mapcount(struct folio *folio, folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT; } folio_unlock_large_mapcount(folio); + return new_mapcount_val + 1; } +#define folio_add_large_mapcount folio_add_return_large_mapcount -static __always_inline void folio_sub_large_mapcount(struct folio *folio, +static __always_inline int folio_sub_return_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { const mm_id_t mm_id = vma->vm_mm->mm_id; @@ -331,7 +333,9 @@ static __always_inline void folio_sub_large_mapcount(struct folio *folio, folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT; out: folio_unlock_large_mapcount(folio); + return new_mapcount_val + 1; } +#define folio_sub_large_mapcount folio_sub_return_large_mapcount #else /* !CONFIG_MM_ID */ /* * See __folio_rmap_sanity_checks(), we might map large folios even without @@ -350,17 +354,33 @@ static inline void folio_add_large_mapcount(struct folio *folio, atomic_add(diff, &folio->_large_mapcount); } +static inline int folio_add_return_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + BUILD_BUG(); +} + static inline void folio_sub_large_mapcount(struct folio *folio, int diff, struct vm_area_struct *vma) { atomic_sub(diff, &folio->_large_mapcount); } + +static inline int folio_sub_return_large_mapcount(struct folio *folio, + int diff, struct vm_area_struct *vma) +{ + BUILD_BUG(); +} #endif /* CONFIG_MM_ID */ #define folio_inc_large_mapcount(folio, vma) \ folio_add_large_mapcount(folio, 1, vma) +#define folio_inc_return_large_mapcount(folio, vma) \ + folio_add_return_large_mapcount(folio, 1, vma) #define folio_dec_large_mapcount(folio, vma) \ folio_sub_large_mapcount(folio, 1, vma) +#define folio_dec_return_large_mapcount(folio, vma) \ + folio_sub_return_large_mapcount(folio, 1, vma) /* RMAP flags, currently only relevant for some anon rmap operations. 
*/ typedef int __bitwise rmap_t; @@ -538,9 +558,11 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, break; } - do { - atomic_inc(&page->_mapcount); - } while (page++, --nr_pages > 0); + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) { + do { + atomic_inc(&page->_mapcount); + } while (page++, --nr_pages > 0); + } folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; case RMAP_LEVEL_PMD: @@ -638,7 +660,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, do { if (PageAnonExclusive(page)) ClearPageAnonExclusive(page); - atomic_inc(&page->_mapcount); + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + atomic_inc(&page->_mapcount); } while (page++, --nr_pages > 0); folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; -- cgit v1.2.3 From f7f0d88b7d6da29d2b073740aa130685f0e18609 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 5 Mar 2025 14:27:29 -0800 Subject: mm/damon/core: expose damos_filter_for_ops() to DAMON kernel API callers damos_filter_for_ops() can be useful to avoid putting the wrong type of filter in the wrong place. Expose it to DAMON kernel API callers. Link: https://lkml.kernel.org/r/20250305222733.59089-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 52559475dbe7..eed008b64a23 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -894,6 +894,7 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed, struct damos_filter *damos_new_filter(enum damos_filter_type type, bool matching, bool allow); void damos_add_filter(struct damos *s, struct damos_filter *f); +bool damos_filter_for_ops(enum damos_filter_type type); void damos_destroy_filter(struct damos_filter *f); struct damos_quota_goal *damos_new_quota_goal( -- cgit v1.2.3 From ff22f9299d7b2c7874b560993c21543708b7e1b6 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 6 Mar 2025 12:50:10 -0800 Subject: page_io: zswap: do not crash the kernel on decompression failure Currently, we crash the kernel when a decompression failure occurs in zswap (either because of memory corruption, or a bug in the compression algorithm). This is overkill. We should only SIGBUS the unfortunate process asking for the zswap entry on zswap load, and skip the corrupted entry in zswap writeback. See [1] for a recent upstream discussion about this. The zswap writeback case is relatively straightforward to fix. For the zswap_load() case, we change the return behavior: * Return 0 on success. * Return -ENOENT (with the folio locked) if zswap does not own the swapped out content. * Return -EIO if zswap owns the swapped out content, but encounters a decompression failure for some reason. The folio will be unlocked, but not be marked up-to-date, which will eventually cause the process requesting the page to SIGBUS (see the handling of not-up-to-date folios in do_swap_page() in mm/memory.c), without crashing the kernel. * Return -EINVAL if we encounter a large folio, as large folios should not be swapped in while zswap is being used. Similar to the -EIO case, we also unlock the folio but do not mark it as up-to-date to SIGBUS the faulting process. As a side effect, we require one extra zswap tree traversal in the load and writeback paths.
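A sketch of how a swap-in caller might dispatch on this new contract (read_from_swapfile() is a hypothetical stand-in for the real block I/O path; this mirrors the rules above, not the actual mm/page_io.c code): """ static void swap_in_folio(struct folio *folio) { switch (zswap_load(folio)) { case 0: /* zswap owned the entry and decompressed it */ break; case -ENOENT: /* not owned by zswap; folio is still locked, so read it * from the backing swap device instead */ read_from_swapfile(folio); break; default: /* -EIO or -EINVAL: folio was unlocked but left !uptodate, * so the faulting task gets SIGBUS instead of a crash */ break; } } """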
Quick benchmarking on a kernel build test shows no performance difference: With the new scheme: real: mean: 125.1s, stdev: 0.12s user: mean: 3265.23s, stdev: 9.62s sys: mean: 2156.41s, stdev: 13.98s The old scheme: real: mean: 125.78s, stdev: 0.45s user: mean: 3287.18s, stdev: 5.95s sys: mean: 2177.08s, stdev: 26.52s [nphamcs@gmail.com: fix documentation of zswap_load()] Link: https://lkml.kernel.org/r/20250306222453.1269456-1-nphamcs@gmail.com Link: https://lore.kernel.org/all/ZsiLElTykamcYZ6J@casper.infradead.org/ [1] Link: https://lkml.kernel.org/r/20250306205011.784787-1-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Matthew Wilcox Suggested-by: Yosry Ahmed Suggested-by: Johannes Weiner Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/zswap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index d961ead91bf1..30c193a1207e 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -26,7 +26,7 @@ struct zswap_lruvec_state { unsigned long zswap_total_pages(void); bool zswap_store(struct folio *folio); -bool zswap_load(struct folio *folio); +int zswap_load(struct folio *folio); void zswap_invalidate(swp_entry_t swp); int zswap_swapon(int type, unsigned long nr_pages); void zswap_swapoff(int type); @@ -44,9 +44,9 @@ static inline bool zswap_store(struct folio *folio) return false; } -static inline bool zswap_load(struct folio *folio) +static inline int zswap_load(struct folio *folio) { - return false; + return -ENOENT; } static inline void zswap_invalidate(swp_entry_t swp) {} -- cgit v1.2.3 From 8268af309d07d1c6279080b4e6fd16ec75cc977c Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:59 +0200 Subject: arch, mm: set max_mapnr when allocating memory map for FLATMEM max_mapnr is essentially the size of the memory map for systems that use FLATMEM. There is no reason to calculate it in each and every architecture when it's anyway calculated in alloc_node_mem_map(). Drop setting of max_mapnr from architecture code and set it once in alloc_node_mem_map(). While on it, move definition of mem_map and max_mapnr to mm/mm_init.c so there won't be two copies for MMU and !MMU variants. Link: https://lkml.kernel.org/r/20250313135003.836600-10-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 82776b409391..8c4cb8c28507 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -46,17 +46,6 @@ extern int sysctl_page_lock_unfairness; void mm_core_init(void); void init_mm_internals(void); -#ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */ -extern unsigned long max_mapnr; - -static inline void set_max_mapnr(unsigned long limit) -{ - max_mapnr = limit; -} -#else -static inline void set_max_mapnr(unsigned long limit) { } -#endif - extern atomic_long_t _totalram_pages; static inline unsigned long totalram_pages(void) { -- cgit v1.2.3 From 6faea3422e3b4e8de44a55aa3e6e843320da66d2 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:50:01 +0200 Subject: arch, mm: streamline HIGHMEM freeing All architectures that support HIGHMEM have their code that frees high memory pages to the buddy allocator while __free_memory_core() is limited to freeing only low memory. There is no actual reason for that. The memory map is completely ready by the time memblock_free_all() is called and high pages can be released to the buddy allocator along with low memory. Remove low memory limit from __free_memory_core() and drop per-architecture code that frees high memory pages. Link: https://lkml.kernel.org/r/20250313135003.836600-12-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8c4cb8c28507..6c519a5098d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3275,7 +3275,6 @@ extern void reserve_bootmem_region(phys_addr_t start, /* Free the reserved page into the buddy system, so it gets managed. 
*/ void free_reserved_page(struct page *page); -#define free_highmem_page(page) free_reserved_page(page) static inline void mark_page_reserved(struct page *page) { -- cgit v1.2.3 From 0d98484ee3330c35d1dc0da0be0871b3596cbe0d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:50:02 +0200 Subject: arch, mm: introduce arch_mm_preinit Currently, the implementation of mem_init() in every architecture consists of one or more of the following: * initializations that must run before page allocator is active, for instance swiotlb_init() * a call to memblock_free_all() to release all the memory to the buddy allocator * initializations that must run after page allocator is ready and there is no arch-specific hook other than mem_init() for that, like for example register_page_bootmem_info() in x86 and sparc64 or simple setting of mem_init_done = 1 in several architectures * a bunch of semi-related stuff that apparently had no better place to live, for example a ton of BUILD_BUG_ON()s in parisc. Introduce arch_mm_preinit() that will be the first thing called from mm_core_init(). On architectures that have initializations that must happen before the page allocator is ready, move those into arch_mm_preinit() along with the code that does not depend on ordering with page allocator setup. On several architectures this results in reduction of mem_init() to a single call to memblock_free_all() that allows its consolidation next. Link: https://lkml.kernel.org/r/20250313135003.836600-13-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6c519a5098d4..c417e5634a58 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -43,6 +43,7 @@ struct folio_batch; extern int sysctl_page_lock_unfairness; +void arch_mm_preinit(void); void mm_core_init(void); void init_mm_internals(void); -- cgit v1.2.3 From 8afa901c147a41f92e83943cddf154bbb7995ee6 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:50:03 +0200 Subject: arch, mm: make releasing of memory to page allocator more explicit The point where the memory is released from memblock to the buddy allocator is hidden inside arch-specific mem_init()s and the call to memblock_free_all() is needlessly duplicated in every architecture. After the introduction of the arch_mm_preinit() hook, the mem_init() implementation on many architectures only contains the call to memblock_free_all(). Pull memblock_free_all() call into mm_core_init() and drop mem_init() on relevant architectures to make it more explicit where the free memory is released from memblock to the buddy allocator and to reduce code duplication in architecture specific code.
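The resulting boot flow can be sketched as follows (a rough paraphrase of mm/mm_init.c after this series; the real function does more in between): """ void mm_core_init(void) { arch_mm_preinit(); /* pre-buddy arch setup, e.g. swiotlb */ /* ... core-MM initialization that memblock suffices for ... */ memblock_free_all(); /* the one explicit memblock -> buddy hand-off */ /* ... initialization that needs the page allocator ... */ } """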
Link: https://lkml.kernel.org/r/20250313135003.836600-14-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Acked-by: Geert Uytterhoeven [m68k] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memblock.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index e79eb6ac516f..ef5a1ecc6e59 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -133,7 +133,6 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size); -void memblock_free_all(void); void memblock_free(void *ptr, size_t size); void reset_all_zones_managed_pages(void); -- cgit v1.2.3 From 53058c762afff714ceaec14b87cf8fdce3b6d33e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:59:04 -0800 Subject: mm/damon: remove damon_callback->private The field was added to let users keep their personal data to use inside of the callbacks. However, no one is actively using that now. Remove it. Link: https://lkml.kernel.org/r/20250306175908.66300-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index eed008b64a23..dab4bb0fe39d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -609,12 +609,10 @@ struct damon_operations { * @after_aggregation: Called after each aggregation. * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. - * @private: User private data. * * The monitoring thread (&damon_ctx.kdamond) calls @before_start and * @before_terminate just before starting and finishing the monitoring, - * respectively. Therefore, those are good places for installing and cleaning - * @private. + * respectively. * * The monitoring thread calls @after_wmarks_check after each DAMON-based * operation schemes' watermarks check. If users need to make changes to the @@ -630,8 +628,6 @@ struct damon_operations { * If any callback returns non-zero, monitoring stops. */ struct damon_callback { - void *private; - int (*before_start)(struct damon_ctx *context); int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); -- cgit v1.2.3 From 07da21855b270c17b2a2d20e644c1419fcaafdd1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:59:05 -0800 Subject: mm/damon: remove ->before_start of damon_callback The function pointer field was added to be used as a place to do some initialization works just before DAMON starts working. However, nobody is using it now. Remove it. 
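For orientation, the structure this series keeps trimming looks as follows after this patch (reconstructed from the diffs in the surrounding commits; the next few patches remove further members): """ struct damon_callback { int (*before_start)(struct damon_ctx *context); int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); int (*before_damos_apply)(struct damon_ctx *context, struct damon_target *target, struct damon_region *region, struct damos *scheme); void (*before_terminate)(struct damon_ctx *context); }; """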
Link: https://lkml.kernel.org/r/20250306175908.66300-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index dab4bb0fe39d..043de2408c65 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -603,16 +603,14 @@ struct damon_operations { /** * struct damon_callback - Monitoring events notification callbacks. * - * @before_start: Called before starting the monitoring. * @after_wmarks_check: Called after each schemes' watermarks check. * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. * - * The monitoring thread (&damon_ctx.kdamond) calls @before_start and - * @before_terminate just before starting and finishing the monitoring, - * respectively. + * The monitoring thread (&damon_ctx.kdamond) calls @before_terminate just + * before finishing the monitoring. * * The monitoring thread calls @after_wmarks_check after each DAMON-based * operation schemes' watermarks check. If users need to make changes to the @@ -628,7 +626,6 @@ struct damon_operations { * If any callback returns non-zero, monitoring stops. */ struct damon_callback { - int (*before_start)(struct damon_ctx *context); int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); -- cgit v1.2.3 From cedee98f68875605dad644a95a63eae04de250b2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:59:06 -0800 Subject: mm/damon: remove damon_callback->after_sampling The callback was used by DAMON sysfs interface for reading DAMON internal data. But it is no more being used, and damon_call() can do similar works in a better way. Remove it. Link: https://lkml.kernel.org/r/20250306175908.66300-12-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 043de2408c65..5aa277f4c948 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -604,7 +604,6 @@ struct damon_operations { * struct damon_callback - Monitoring events notification callbacks. * * @after_wmarks_check: Called after each schemes' watermarks check. - * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. @@ -617,17 +616,15 @@ struct damon_operations { * attributes of the monitoring context while it's deactivated due to the * watermarks, this is the good place to do. * - * The monitoring thread calls @after_sampling and @after_aggregation for each - * of the sampling intervals and aggregation intervals, respectively. - * Therefore, users can safely access the monitoring results without additional - * protection. For the reason, users are recommended to use these callback for - * the accesses to the results. + * The monitoring thread calls @after_aggregation for each of the aggregation + * intervals. Therefore, users can safely access the monitoring results + * without additional protection. 
For the reason, users are recommended to use + * these callback for the accesses to the results. * * If any callback returns non-zero, monitoring stops. */ struct damon_callback { int (*after_wmarks_check)(struct damon_ctx *context); - int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); int (*before_damos_apply)(struct damon_ctx *context, struct damon_target *target, -- cgit v1.2.3 From 99ce7c9c6d855716356f2f839c53905592fd780b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:59:07 -0800 Subject: mm/damon: remove damon_callback->before_damos_apply The hook was introduced to let DAMON kernel API users access DAMOS schemes-eligible regions in a safe way. Now it is no longer used by anyone, and the functionality is provided in a better way by damos_walk(). Remove it. Link: https://lkml.kernel.org/r/20250306175908.66300-13-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 5aa277f4c948..be7b281fb922 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -605,7 +605,6 @@ struct damon_operations { * * @after_wmarks_check: Called after each schemes' watermarks check. * @after_aggregation: Called after each aggregation. - * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. @@ -626,10 +625,6 @@ struct damon_operations { struct damon_callback { int (*after_wmarks_check)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); - int (*before_damos_apply)(struct damon_ctx *context, - struct damon_target *target, - struct damon_region *region, - struct damos *scheme); void (*before_terminate)(struct damon_ctx *context); }; -- cgit v1.2.3 From 105f830fa35c49ada7db785a7f9b70386f193529 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:59:08 -0800 Subject: mm/damon: remove damon_operations->reset_aggregated The operations layer hook was introduced to let operations set do any aggregation data reset if needed. But it is not really being used now. Remove it. Link: https://lkml.kernel.org/r/20250306175908.66300-14-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index be7b281fb922..3db4f77261f5 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -542,7 +542,6 @@ enum damon_ops_id { * @update: Update operations-related data structures. * @prepare_access_checks: Prepare next access check of target regions. * @check_accesses: Check the accesses to target regions. - * @reset_aggregated: Reset aggregated accesses monitoring results. * @get_scheme_score: Get the score of a region for a scheme. * @apply_scheme: Apply a DAMON-based operation scheme. * @target_valid: Determine if the target is valid. @@ -554,8 +553,7 @@ enum damon_ops_id { * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting * the monitoring, @update after each &damon_attrs.ops_update_interval, and * @check_accesses, @target_valid and @prepare_access_checks after each - * &damon_attrs.sample_interval. Finally, @reset_aggregated is called after - * each &damon_attrs.aggr_interval.
+ * &damon_attrs.sample_interval. * * Each &struct damon_operations instance having valid @id can be registered * via damon_register_ops() and selected by damon_select_ops() later. @@ -570,8 +568,6 @@ enum damon_ops_id { * last preparation and update the number of observed accesses of each region. * It should also return max number of observed accesses that made as a result * of its update. The value will be used for regions adjustment threshold. - * @reset_aggregated should reset the access monitoring results that aggregated - * by @check_accesses. * @get_scheme_score should return the priority score of a region for a scheme * as an integer in [0, &DAMOS_MAX_SCORE]. * @apply_scheme is called from @kdamond when a region for user provided @@ -589,7 +585,6 @@ struct damon_operations { void (*update)(struct damon_ctx *context); void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); - void (*reset_aggregated)(struct damon_ctx *context); int (*get_scheme_score)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); -- cgit v1.2.3 From 9039b9096ea27a20f0349d1537537663c935c8ed Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Thu, 6 Mar 2025 17:44:50 -0500 Subject: mm: page_ext: add an iteration API for page extensions Patch series "mm: page_ext: Introduce new iteration API", v3. Introduction ============ [ Thanks to David Hildenbrand for identifying the root cause of this issue and providing guidance on how to fix it. The new API idea, bugs and misconceptions are all mine though ] Currently, trying to reserve 1G pages with page_owner=on and sparsemem causes a crash. The reproducer is very simple: 1. Build the kernel with CONFIG_SPARSEMEM=y and the table extensions 2. Pass 'default_hugepagesz=1 page_owner=on' in the kernel command-line 3. Reserve one 1G page at run-time, this should crash (see patch 1 for backtrace) [ A crash with page_table_check is also possible, but harder to trigger ] Apparently, starting with commit cf54f310d0d3 ("mm/hugetlb: use __GFP_COMP for gigantic folios") we now pass the full allocation order to page extension clients and the page extension implementation assumes that all PFNs of an allocation range will be stored in the same memory section (which is not true for 1G pages). To fix this, this series introduces a new iteration API for page extension objects. The API checks if the next page extension object can be retrieved from the current section or if it needs to be looked up in another section. Please, find all details in patch 1. I tested this series on arm64 and x86 by reserving 1G pages at run-time and doing kernel builds (always with page_owner=on and page_table_check=on). This patch (of 3): The page extension implementation assumes that all page extensions of a given page order are stored in the same memory section. The function page_ext_next() relies on this assumption by adding an offset to the current object to return the next adjacent page extension. This behavior works as expected for flatmem but fails for sparsemem when using 1G pages. The commit cf54f310d0d3 ("mm/hugetlb: use __GFP_COMP for gigantic folios") exposes this issue, making a crash possible when using page_owner or page_table_check page extensions. The problem is that for 1G pages, the page extensions may span memory section boundaries and be stored in different memory sections.
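As a concrete illustration (assuming x86-64 defaults: 4K pages and SECTION_SIZE_BITS == 27), PAGES_PER_SECTION = 2^(27-12) = 32768 pages, while a 1G gigantic folio spans 2^(30-12) = 262144 pages; its page extensions are therefore spread across 262144 / 32768 = 8 memory sections, so blindly advancing with page_ext_next() walks off the end of the first section's page_ext array.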
This issue was not visible before commit cf54f310d0d3 ("mm/hugetlb: use __GFP_COMP for gigantic folios") because alloc_contig_pages() never passed more than MAX_PAGE_ORDER to post_alloc_hook(). However, the series introducing the mentioned commit changed this behavior, allowing the full 1G page order to be passed. Reproducer: 1. Build the kernel with CONFIG_SPARSEMEM=y and table extensions support 2. Pass 'default_hugepagesz=1 page_owner=on' in the kernel command-line 3. Reserve one 1G page at run-time, this should crash (backtrace below) To address this issue, this commit introduces a new API for iterating through page extensions. The main iteration macro is for_each_page_ext() and it must be called with the RCU read lock taken. Here's a usage example: """ struct page_ext_iter iter; struct page_ext *page_ext; ... rcu_read_lock(); for_each_page_ext(page, 1 << order, page_ext, iter) { struct my_page_ext *obj = get_my_page_ext_obj(page_ext); ... } rcu_read_unlock(); """ The loop construct uses page_ext_iter_next() which checks to see if we have crossed sections in the iteration. In this case, page_ext_iter_next() retrieves the next page_ext object from another section. Thanks to David Hildenbrand for helping identify the root cause and providing suggestions on how to fix and optimize the solution (final implementation and bugs are all mine though). Lastly, here's the backtrace, without kasan you can get random crashes: [ 76.052526] BUG: KASAN: slab-out-of-bounds in __update_page_owner_handle+0x238/0x298 [ 76.060283] Write of size 4 at addr ffff07ff96240038 by task tee/3598 [ 76.066714] [ 76.068203] CPU: 88 UID: 0 PID: 3598 Comm: tee Kdump: loaded Not tainted 6.13.0-rep1 #3 [ 76.076202] Hardware name: WIWYNN Mt.Jade Server System B81.030Z1.0007/Mt.Jade Motherboard, BIOS 2.10.20220810 (SCP: 2.10.20220810) 2022/08/10 [ 76.088972] Call trace: [ 76.091411] show_stack+0x20/0x38 (C) [ 76.095073] dump_stack_lvl+0x80/0xf8 [ 76.098733] print_address_description.constprop.0+0x88/0x398 [ 76.104476] print_report+0xa8/0x278 [ 76.108041] kasan_report+0xa8/0xf8 [ 76.111520] __asan_report_store4_noabort+0x20/0x30 [ 76.116391] __update_page_owner_handle+0x238/0x298 [ 76.121259] __set_page_owner+0xdc/0x140 [ 76.125173] post_alloc_hook+0x190/0x1d8 [ 76.129090] alloc_contig_range_noprof+0x54c/0x890 [ 76.133874] alloc_contig_pages_noprof+0x35c/0x4a8 [ 76.138656] alloc_gigantic_folio.isra.0+0x2c0/0x368 [ 76.143616] only_alloc_fresh_hugetlb_folio.isra.0+0x24/0x150 [ 76.149353] alloc_pool_huge_folio+0x11c/0x1f8 [ 76.153787] set_max_huge_pages+0x364/0xca8 [ 76.157961] __nr_hugepages_store_common+0xb0/0x1a0 [ 76.162829] nr_hugepages_store+0x108/0x118 [ 76.167003] kobj_attr_store+0x3c/0x70 [ 76.170745] sysfs_kf_write+0xfc/0x188 [ 76.174492] kernfs_fop_write_iter+0x274/0x3e0 [ 76.178927] vfs_write+0x64c/0x8e0 [ 76.182323] ksys_write+0xf8/0x1f0 [ 76.185716] __arm64_sys_write+0x74/0xb0 [ 76.189630] invoke_syscall.constprop.0+0xd8/0x1e0 [ 76.194412] do_el0_svc+0x164/0x1e0 [ 76.197891] el0_svc+0x40/0xe0 [ 76.200939] el0t_64_sync_handler+0x144/0x168 [ 76.205287] el0t_64_sync+0x1ac/0x1b0 Link: https://lkml.kernel.org/r/cover.1741301089.git.luizcap@redhat.com Link: https://lkml.kernel.org/r/a45893880b7e1601082d39d2c5c8b50bcc096305.1741301089.git.luizcap@redhat.com Fixes: cf54f310d0d3 ("mm/hugetlb: use __GFP_COMP for gigantic folios") Signed-off-by: Luiz Capitulino Acked-by: David Hildenbrand Cc: Johannes Weiner Cc: Luiz Capitulino Cc: Muchun Song Cc: Pasha Tatashin Cc: Yu Zhao Signed-off-by: Andrew Morton ---
include/linux/page_ext.h | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index e4b48a0dda24..76c817162d2f 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -3,6 +3,7 @@ #define __LINUX_PAGE_EXT_H #include +#include #include struct pglist_data; @@ -69,16 +70,31 @@ extern void page_ext_init(void); static inline void page_ext_init_flatmem_late(void) { } + +static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn) +{ + /* + * page_ext is allocated per memory section. Once we cross a + * memory section, we have to fetch the new pointer. + */ + return next_pfn % PAGES_PER_SECTION; +} #else extern void page_ext_init_flatmem(void); extern void page_ext_init_flatmem_late(void); static inline void page_ext_init(void) { } + +static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn) +{ + return true; +} #endif extern struct page_ext *page_ext_get(const struct page *page); extern void page_ext_put(struct page_ext *page_ext); +extern struct page_ext *page_ext_lookup(unsigned long pfn); static inline void *page_ext_data(struct page_ext *page_ext, struct page_ext_operations *ops) @@ -93,6 +109,83 @@ static inline struct page_ext *page_ext_next(struct page_ext *curr) return next; } +struct page_ext_iter { + unsigned long index; + unsigned long start_pfn; + struct page_ext *page_ext; +}; + +/** + * page_ext_iter_begin() - Prepare for iterating through page extensions. + * @iter: page extension iterator. + * @pfn: PFN of the page we're interested in. + * + * Must be called with RCU read lock taken. + * + * Return: NULL if no page_ext exists for this page. + */ +static inline struct page_ext *page_ext_iter_begin(struct page_ext_iter *iter, + unsigned long pfn) +{ + iter->index = 0; + iter->start_pfn = pfn; + iter->page_ext = page_ext_lookup(pfn); + + return iter->page_ext; +} + +/** + * page_ext_iter_next() - Get next page extension + * @iter: page extension iterator. + * + * Must be called with RCU read lock taken. + * + * Return: NULL if no next page_ext exists. + */ +static inline struct page_ext *page_ext_iter_next(struct page_ext_iter *iter) +{ + unsigned long pfn; + + if (WARN_ON_ONCE(!iter->page_ext)) + return NULL; + + iter->index++; + pfn = iter->start_pfn + iter->index; + + if (page_ext_iter_next_fast_possible(pfn)) + iter->page_ext = page_ext_next(iter->page_ext); + else + iter->page_ext = page_ext_lookup(pfn); + + return iter->page_ext; +} + +/** + * page_ext_iter_get() - Get current page extension + * @iter: page extension iterator. + * + * Return: NULL if no page_ext exists for this iterator. + */ +static inline struct page_ext *page_ext_iter_get(const struct page_ext_iter *iter) +{ + return iter->page_ext; +} + +/** + * for_each_page_ext(): iterate through page_ext objects. + * @__page: the page we're interested in + * @__pgcount: how many pages to iterate through + * @__page_ext: struct page_ext pointer where the current page_ext + * object is returned + * @__iter: struct page_ext_iter object (defined in the stack) + * + * IMPORTANT: must be called with RCU read lock taken. 
+ */ +#define for_each_page_ext(__page, __pgcount, __page_ext, __iter) \ + for (__page_ext = page_ext_iter_begin(&__iter, page_to_pfn(__page));\ + __page_ext && __iter.index < __pgcount; \ + __page_ext = page_ext_iter_next(&__iter)) + #else /* !CONFIG_PAGE_EXTENSION */ struct page_ext; -- cgit v1.2.3 From 3fec86f8aa8c7ae84567a0d1396e84ada96141d8 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 7 Mar 2025 12:39:54 -0500 Subject: xarray: add xas_try_split() to split a multi-index entry Patch series "Buddy allocator like (or non-uniform) folio split", v10. This patchset adds a new buddy allocator like (or non-uniform) large folio split from an order-n folio to order-m with m < n. It reduces 1. the total number of after-split folios from 2^(n-m) to n-m+1; 2. the amount of memory needed for multi-index xarray split from 2^(n/6-m/6) to n/6-m/6, assuming XA_CHUNK_SHIFT=6; 3. keep more large folios after a split from all order-m folios to order-(n-1) to order-m folios. For example, to split an order-9 to order-0, folio split generates 10 (or 11 for anonymous memory) folios instead of 512, allocates 1 xa_node instead of 8, and leaves 1 order-8, 1 order-7, ..., 1 order-1 and 2 order-0 folios (or 4 order-0 for anonymous memory) instead of 512 order-0 folios. Instead of duplicating existing split_huge_page*() code, __folio_split() is introduced as the shared backend code for both split_huge_page_to_list_to_order() and folio_split(). __folio_split() can support both uniform split and buddy allocator like (or non-uniform) split. All existing split_huge_page*() users can be gradually converted to use folio_split() if possible. In this patchset, I converted truncate_inode_partial_folio() to use folio_split(). xfstests quick group passed for both tmpfs and xfs. I also semi-replicated Hugh's test[12] and ran it without any issue for almost 24 hours. This patch (of 8): A preparation patch for non-uniform folio split, which always splits a folio in half iteratively, and minimal xarray entry split. Currently, xas_split_alloc() and xas_split() always split all slots from a multi-index entry. They cost the same number of xa_node as the to-be-split slots. For example, to split an order-9 entry, which takes 2^(9-6)=8 slots, assuming XA_CHUNK_SHIFT is 6 (!CONFIG_BASE_SMALL), 8 xa_node are needed. Instead, xas_try_split() is intended to be used iteratively to split the order-9 entry into 2 order-8 entries, then split one order-8 entry, based on the given index, to 2 order-7 entries, ..., and split one order-1 entry to 2 order-0 entries. When splitting the order-6 entry and a new xa_node is needed, xas_try_split() will try to allocate one if possible. As a result, xas_try_split() would only need 1 xa_node instead of 8. When a new xa_node is needed during the split, xas_try_split() can try to allocate one but no more. -ENOMEM will be returned if a node cannot be allocated. -EINVAL will be returned if a sibling node is split or cascade split happens, where two or more new nodes are needed, and these are not supported by xas_try_split(). xas_split_alloc() and xas_split() split an order-9 to order-0: --------------------------------- | | | | | | | | | | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | | | | | | | | | | --------------------------------- | | | | ------- --- --- ------- | | ... | | V V V V ----------- ----------- ----------- ----------- | xa_node | | xa_node | ...
| xa_node | | xa_node | ----------- ----------- ----------- ----------- xas_try_split() splits an order-9 to order-0: --------------------------------- | | | | | | | | | | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | | | | | | | | | | --------------------------------- | | V ----------- | xa_node | ----------- Link: https://lkml.kernel.org/r/20250307174001.242794-1-ziy@nvidia.com Link: https://lkml.kernel.org/r/20250307174001.242794-2-ziy@nvidia.com Signed-off-by: Zi Yan Cc: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: John Hubbard Cc: Kefeng Wang Cc: Kirill A. Shuemov Cc: Miaohe Lin Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zi Yan Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/xarray.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 0b618ec04115..4010195201c9 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1555,6 +1555,7 @@ int xa_get_order(struct xarray *, unsigned long index); int xas_get_order(struct xa_state *xas); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); +void xas_try_split(struct xa_state *xas, void *entry, unsigned int order); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { @@ -1576,6 +1577,11 @@ static inline void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { } + +static inline void xas_try_split(struct xa_state *xas, void *entry, + unsigned int order) +{ +} #endif /** -- cgit v1.2.3 From 7460b470a131f985a70302a322617121efdd7caa Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 7 Mar 2025 12:40:00 -0500 Subject: mm/truncate: use folio_split() in truncate operation Instead of splitting the large folio uniformly during truncation, try to use buddy allocator like folio_split() at the start and the end of a truncation range to minimize the number of resulting folios if it is supported. try_folio_split() is introduced to use folio_split() if supported and it falls back to uniform split otherwise. For example, to truncate an order-4 folio [0, 1, 2, 3, 4, 5, ..., 15] between [3, 10] (inclusive), folio_split() splits the folio at 3 to [0,1], [2], [3], [4..7], [8..15] and [3], [4..7] can be dropped and [8..15] is kept with zeros in [8..10], then another folio_split() is done at 10, so [8..10] can be dropped. One possible optimization is to make folio_split() split a folio based on a given range, like [3..10] above. But that complicates folio_split(), so it will be investigated when necessary. Link: https://lkml.kernel.org/r/20250226210032.2044041-8-ziy@nvidia.com Link: https://lkml.kernel.org/r/20250307174001.242794-8-ziy@nvidia.com Signed-off-by: Zi Yan Cc: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: John Hubbard Cc: Kefeng Wang Cc: Kirill A.
Shuemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e57e811cfd3c..e893d546a49f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -345,6 +345,36 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order); int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); +bool uniform_split_supported(struct folio *folio, unsigned int new_order, + bool warns); +bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, + bool warns); +int folio_split(struct folio *folio, unsigned int new_order, struct page *page, + struct list_head *list); +/* + * try_folio_split - try to split a @folio at @page using non uniform split. + * @folio: folio to be split + * @page: split to order-0 at the given page + * @list: store the after-split folios + * + * Try to split a @folio at @page using non uniform split to order-0, if + * non uniform split is not supported, fall back to uniform split. + * + * Return: 0: split is successful, otherwise split failed. + */ +static inline int try_folio_split(struct folio *folio, struct page *page, + struct list_head *list) +{ + int ret = min_order_for_split(folio); + + if (ret < 0) + return ret; + + if (!non_uniform_split_supported(folio, 0, false)) + return split_huge_page_to_list_to_order(&folio->page, list, + ret); + return folio_split(folio, ret, page, list); +} static inline int split_huge_page(struct page *page) { struct folio *folio = page_folio(page); @@ -537,6 +567,12 @@ static inline int split_folio_to_list(struct folio *folio, struct list_head *lis return 0; } +static inline int try_folio_split(struct folio *folio, struct page *page, + struct list_head *list) +{ + return 0; +} + static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) -- cgit v1.2.3 From 200a89c159a7a416115e6e309183c82183bf98aa Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 14 Mar 2025 18:21:12 -0400 Subject: mm/filemap: use xas_try_split() in __filemap_add_folio() Patch series "Minimize xa_node allocation during xarray split", v3. When splitting a multi-index entry in XArray from order-n to order-m, existing xas_split_alloc()+xas_split() approach requires 2^(n % XA_CHUNK_SHIFT) xa_node allocations. But its callers, __filemap_add_folio() and shmem_split_large_entry(), use at most 1 xa_node. To minimize xa_node allocation and remove the limitation of no split from order-12 (or above) to order-0 (or anything between 0 and 5)[1], xas_try_split() was added[2], which allocates (n / XA_CHUNK_SHIFT - m / XA_CHUNK_SHIFT) xa_node. It is used for non-uniform folio split, but can be used by __filemap_add_folio() and shmem_split_large_entry(). xas_split_alloc() and xas_split() split an order-9 to order-0: --------------------------------- | | | | | | | | | | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | | | | | | | | | | --------------------------------- | | | | ------- --- --- ------- | | ... | | V V V V ----------- ----------- ----------- ----------- | xa_node | | xa_node | ...
xas_try_split() splits an order-9 to order-0:

 ---------------------------------
 |   |   |   |   |   |   |   |   |
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
 |   |   |   |   |   |   |   |   |
 ---------------------------------
      |
      |
      V
 -----------
 | xa_node |
 -----------

xas_try_split() is designed to be called iteratively with n = m + 1.
xas_try_split_min_order() is added to minimize the number of calls to
xas_try_split() by telling the caller the next minimal order to split to
instead of n - 1. Splitting order-n to order-m when
m = l * XA_CHUNK_SHIFT does not require xa_node allocation and requires
1 xa_node when n = l * XA_CHUNK_SHIFT and m = n - 1, so it is OK to use
xas_try_split() with n > m + 1 when no new xa_node is needed.

xfstests quick group test passed on xfs and tmpfs.

[1] https://lore.kernel.org/linux-mm/Z6YX3RznGLUD07Ao@casper.infradead.org/
[2] https://lore.kernel.org/linux-mm/20250226210032.2044041-1-ziy@nvidia.com/

This patch (of 2):

During __filemap_add_folio(), a shadow entry covering n slots may be
present while a folio covering m slots, with m < n, is to be added.
Instead of splitting all n slots, only the m slots covered by the folio
need to be split and the remaining n-m shadow entries can be retained
with orders ranging from m to n-1. This method only requires
(n/XA_CHUNK_SHIFT) - (m/XA_CHUNK_SHIFT) new xa_nodes instead of
(n % XA_CHUNK_SHIFT) * ((n/XA_CHUNK_SHIFT) - (m/XA_CHUNK_SHIFT)) new
xa_nodes, compared to the original xas_split_alloc() + xas_split() one.
For example, to insert an order-0 folio when an order-9 shadow entry is
present (assuming XA_CHUNK_SHIFT is 6), 1 xa_node is needed instead of 8.

xas_try_split_min_order() is introduced to reduce the number of calls to
xas_try_split() during split.

Link: https://lkml.kernel.org/r/20250314222113.711703-1-ziy@nvidia.com
Link: https://lkml.kernel.org/r/20250314222113.711703-2-ziy@nvidia.com
Signed-off-by: Zi Yan
Cc: Baolin Wang
Cc: Hugh Dickins
Cc: Kairui Song
Cc: Miaohe Lin
Cc: Matthew Wilcox
Cc: David Hildenbrand
Cc: John Hubbard
Cc: Kefeng Wang
Cc: Kirill A. Shutemov
Cc: Ryan Roberts
Cc: Yang Shi
Cc: Yu Zhao
Signed-off-by: Andrew Morton
---
 include/linux/xarray.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 4010195201c9..78eede109b1a 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1556,6 +1556,7 @@ int xas_get_order(struct xa_state *xas);
 void xas_split(struct xa_state *, void *entry, unsigned int order);
 void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t);
 void xas_try_split(struct xa_state *xas, void *entry, unsigned int order);
+unsigned int xas_try_split_min_order(unsigned int order);
 #else
 static inline int xa_get_order(struct xarray *xa, unsigned long index)
 {
@@ -1582,6 +1583,12 @@ static inline void xas_try_split(struct xa_state *xas, void *entry,
 		unsigned int order)
 {
 }
+
+static inline unsigned int xas_try_split_min_order(unsigned int order)
+{
+	return 0;
+}
+
 #endif

 /**
--
cgit v1.2.3


From 61659efdb35ce6c6ac7639342098f3c4548b794b Mon Sep 17 00:00:00 2001
From: Gavin Shan
Date: Wed, 12 Mar 2025 09:30:43 +1000
Subject: drivers/base/memory: improve add_boot_memory_block()

Patch series "drivers/base/memory: Two cleanups", v3.

Two cleanups to drivers/base/memory.

This patch (of 2):

It's unnecessary to count the present sections for the specified block
since the block will be added if any section in the block is present.
Besides, for_each_present_section_nr() can be reused as Andrew Morton
suggested.

Improve by using for_each_present_section_nr() and dropping the
unnecessary @section_count. No functional changes intended.

Link: https://lkml.kernel.org/r/20250311233045.148943-1-gshan@redhat.com
Link: https://lkml.kernel.org/r/20250311233045.148943-2-gshan@redhat.com
Signed-off-by: Gavin Shan
Acked-by: David Hildenbrand
Acked-by: Oscar Salvador
Cc: Danilo Krummrich
Cc: Greg Kroah-Hartman
Cc: "Rafael J. Wysocki"
Signed-off-by: Andrew Morton
---
 include/linux/mmzone.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 550dbba92521..dbb0ad69e17f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -2140,6 +2140,11 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr)
 	return -1;
 }

+#define for_each_present_section_nr(start, section_nr)		\
+	for (section_nr = next_present_section_nr(start - 1);	\
+	     section_nr != -1;					\
+	     section_nr = next_present_section_nr(section_nr))
+
 /*
  * These are _only_ used during initialisation, therefore they
  * can use __initdata ...  They could have names to indicate
--
cgit v1.2.3


From 1a24776fca39ed79d7f5e0b0592b5addd784981e Mon Sep 17 00:00:00 2001
From: Gavin Shan
Date: Wed, 12 Mar 2025 09:30:44 +1000
Subject: drivers/base/memory: correct the field name in the header

Replace @blocks with @memory_blocks to match with the definition of
struct memory_group.

Link: https://lkml.kernel.org/r/20250311233045.148943-3-gshan@redhat.com
Signed-off-by: Gavin Shan
Acked-by: David Hildenbrand
Acked-by: Oscar Salvador
Cc: Danilo Krummrich
Cc: Greg Kroah-Hartman
Cc: "Rafael J. Wysocki"
Signed-off-by: Andrew Morton
---
 include/linux/memory.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memory.h b/include/linux/memory.h
index c0afee5d126e..12daa6ec7d09 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -25,7 +25,7 @@
 /**
  * struct memory_group - a logical group of memory blocks
  * @nid: The node id for all memory blocks inside the memory group.
- * @blocks: List of all memory blocks belonging to this memory group.
+ * @memory_blocks: List of all memory blocks belonging to this memory group.
  * @present_kernel_pages: Present (online) memory outside ZONE_MOVABLE of this
  *			  memory group.
  * @present_movable_pages: Present (online) memory in ZONE_MOVABLE of this
--
cgit v1.2.3


From 67914ac08604345f620566ccf5bac87b40d5881d Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Thu, 13 Mar 2025 17:05:32 -0400
Subject: mm: compaction: push watermark into compaction_suitable() callers

Patch series "mm: reliable huge page allocator".

This series makes changes to the allocator and reclaim/compaction code to
try harder to avoid fragmentation. As a result, this makes huge page
allocations cheaper, more reliable and more sustainable.

It's a subset of the huge page allocator RFC initially proposed here:

  https://lore.kernel.org/lkml/20230418191313.268131-1-hannes@cmpxchg.org/

The following results are from a kernel build test, with additional
concurrent bursts of THP allocations on a memory-constrained system.
Comparing before and after the changes over 15 runs:
Comparing before and after the changes over 15 runs: before after Hugealloc Time mean 52739.45 ( +0.00%) 28904.00 ( -45.19%) Hugealloc Time stddev 56541.26 ( +0.00%) 33464.37 ( -40.81%) Kbuild Real time 197.47 ( +0.00%) 196.59 ( -0.44%) Kbuild User time 1240.49 ( +0.00%) 1231.67 ( -0.71%) Kbuild System time 70.08 ( +0.00%) 59.10 ( -15.45%) THP fault alloc 46727.07 ( +0.00%) 63223.67 ( +35.30%) THP fault fallback 21910.60 ( +0.00%) 5412.47 ( -75.29%) Direct compact fail 195.80 ( +0.00%) 59.07 ( -69.48%) Direct compact success 7.93 ( +0.00%) 2.80 ( -57.46%) Direct compact success rate % 3.51 ( +0.00%) 3.99 ( +10.49%) Compact daemon scanned migrate 3369601.27 ( +0.00%) 2267500.33 ( -32.71%) Compact daemon scanned free 5075474.47 ( +0.00%) 2339773.00 ( -53.90%) Compact direct scanned migrate 161787.27 ( +0.00%) 47659.93 ( -70.54%) Compact direct scanned free 163467.53 ( +0.00%) 40729.67 ( -75.08%) Compact total migrate scanned 3531388.53 ( +0.00%) 2315160.27 ( -34.44%) Compact total free scanned 5238942.00 ( +0.00%) 2380502.67 ( -54.56%) Alloc stall 2371.07 ( +0.00%) 638.87 ( -73.02%) Pages kswapd scanned 2160926.73 ( +0.00%) 4002186.33 ( +85.21%) Pages kswapd reclaimed 533191.07 ( +0.00%) 718577.80 ( +34.77%) Pages direct scanned 400450.33 ( +0.00%) 355172.73 ( -11.31%) Pages direct reclaimed 94441.73 ( +0.00%) 31162.80 ( -67.00%) Pages total scanned 2561377.07 ( +0.00%) 4357359.07 ( +70.12%) Pages total reclaimed 627632.80 ( +0.00%) 749740.60 ( +19.46%) Swap out 47959.53 ( +0.00%) 110084.33 ( +129.53%) Swap in 7276.00 ( +0.00%) 24457.00 ( +236.10%) File refaults 138043.00 ( +0.00%) 188226.93 ( +36.35%) THP latencies are cut in half, and failure rates are cut by 75%. These metrics also hold up over time, while the vanilla kernel sees a steady downward trend in success rates with each subsequent run, owed to the cumulative effects of fragmentation. A more detailed discussion of results is in the patch changelogs. The patches first introduce a vm.defrag_mode sysctl, which enforces the existing ALLOC_NOFRAGMENT alloc flag until after reclaim and compaction have run. They then change kswapd and kcompactd to target pageblocks, which boosts success in the ALLOC_NOFRAGMENT hotpaths. Patches #1 and #2 are somewhat unrelated cleanups, but touch the same code and so are included here to avoid conflicts from re-ordering. This patch (of 5): compaction_suitable() hardcodes the min watermark, with a boost to the low watermark for costly orders. However, compaction_ready() requires order-0 at the high watermark. It currently checks the marks twice. Make the watermark a parameter to compaction_suitable() and have the callers pass in what they require: - compaction_zonelist_suitable() is used by the direct reclaim path, so use the min watermark. - compact_suit_allocation_order() has a watermark in context derived from cc->alloc_flags. The only quirk is that kcompactd doesn't initialize cc->alloc_flags explicitly. There is a direct check in kcompactd_do_work() that passes ALLOC_WMARK_MIN, but there is another check downstack in compact_zone() that ends up passing the unset alloc_flags. Since they default to 0, and that coincides with ALLOC_WMARK_MIN, it is correct. But it's subtle. Set cc->alloc_flags explicitly. - should_continue_reclaim() is direct reclaim, use the min watermark. - Finally, consolidate the two checks in compaction_ready() to a single compaction_suitable() call passing the high watermark. 
There is a tiny change in behavior: before, compaction_suitable() would
check order-0 against min or low, depending on costly order. Then there'd
be another high watermark check. Now, the high watermark is passed to
compaction_suitable(), and the costly order-boost (low - min) is added on
top. This means compaction_ready() sets a marginally higher target for
free pages. In a kernel build + THP pressure test, though, this didn't
show any measurable negative effects on memory pressure or reclaim rates.
As the comment above the check says, reclaim is usually stopped short on
should_continue_reclaim(), and this just defines the worst-case reclaim
cutoff in case compaction is not making any headway.

[hughd@google.com: stop oops on out-of-range highest_zoneidx]
  Link: https://lkml.kernel.org/r/005ace8b-07fa-01d4-b54b-394a3e029c07@google.com
Link: https://lkml.kernel.org/r/20250313210647.1314586-1-hannes@cmpxchg.org
Link: https://lkml.kernel.org/r/20250313210647.1314586-2-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner
Signed-off-by: Hugh Dickins
Acked-by: Zi Yan
Cc: Mel Gorman
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 include/linux/compaction.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 7bf0c521db63..173d9c07a895 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -95,7 +95,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
 		struct page **page);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern bool compaction_suitable(struct zone *zone, int order,
-		int highest_zoneidx);
+		unsigned long watermark, int highest_zoneidx);

 extern void compaction_defer_reset(struct zone *zone, int order,
 		bool alloc_success);
@@ -113,7 +113,8 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat)
 }

 static inline bool compaction_suitable(struct zone *zone, int order,
-		int highest_zoneidx)
+		unsigned long watermark,
+		int highest_zoneidx)
 {
 	return false;
 }
--
cgit v1.2.3


From a211c6550efcc87aa2459ca347bda10721c7a46a Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Thu, 13 Mar 2025 17:05:36 -0400
Subject: mm: page_alloc: defrag_mode kswapd/kcompactd watermarks

The previous patch added pageblock_order reclaim to kswapd/kcompactd,
which helps, but produces only one block at a time. Allocation stalls and
THP failure rates are still higher than they could be.

To adequately reflect ALLOC_NOFRAGMENT demand for pageblocks, change the
watermarking for kswapd & kcompactd: instead of targeting the high
watermark in order-0 pages and checking for one suitable block, simply
require that the high watermark is entirely met in pageblocks.

To this end, track the number of free pages within contiguous pageblocks,
then change pgdat_balanced() and compact_finished() to check watermarks
against this new value.
This further reduces THP latencies and allocation stalls, and improves
THP success rates against the previous patch:

                                  DEFRAGMODE-ASYNC     DEFRAGMODE-ASYNC-WMARKS
Hugealloc Time mean               34300.36 (  +0.00%)   28904.00 ( -15.73%)
Hugealloc Time stddev             36390.42 (  +0.00%)   33464.37 (  -8.04%)
Kbuild Real time                    196.13 (  +0.00%)     196.59 (  +0.23%)
Kbuild User time                   1234.74 (  +0.00%)    1231.67 (  -0.25%)
Kbuild System time                   62.62 (  +0.00%)      59.10 (  -5.54%)
THP fault alloc                   57054.53 (  +0.00%)   63223.67 ( +10.81%)
THP fault fallback                11581.40 (  +0.00%)    5412.47 ( -53.26%)
Direct compact fail                 107.80 (  +0.00%)      59.07 ( -44.79%)
Direct compact success                4.53 (  +0.00%)       2.80 ( -31.33%)
Direct compact success rate %         3.20 (  +0.00%)       3.99 ( +18.66%)
Compact daemon scanned migrate  5461033.93 (  +0.00%) 2267500.33 ( -58.48%)
Compact daemon scanned free     5824897.93 (  +0.00%) 2339773.00 ( -59.83%)
Compact direct scanned migrate    58336.93 (  +0.00%)   47659.93 ( -18.30%)
Compact direct scanned free       32791.87 (  +0.00%)   40729.67 ( +24.21%)
Compact total migrate scanned   5519370.87 (  +0.00%) 2315160.27 ( -58.05%)
Compact total free scanned      5857689.80 (  +0.00%) 2380502.67 ( -59.36%)
Alloc stall                        2424.60 (  +0.00%)     638.87 ( -73.62%)
Pages kswapd scanned            2657018.33 (  +0.00%) 4002186.33 ( +50.63%)
Pages kswapd reclaimed           559583.07 (  +0.00%)  718577.80 ( +28.41%)
Pages direct scanned             722094.07 (  +0.00%)  355172.73 ( -50.81%)
Pages direct reclaimed           107257.80 (  +0.00%)   31162.80 ( -70.95%)
Pages total scanned             3379112.40 (  +0.00%) 4357359.07 ( +28.95%)
Pages total reclaimed            666840.87 (  +0.00%)  749740.60 ( +12.43%)
Swap out                          77238.20 (  +0.00%)  110084.33 ( +42.53%)
Swap in                           11712.80 (  +0.00%)   24457.00 (+108.80%)
File refaults                    143438.80 (  +0.00%)  188226.93 ( +31.22%)

Also of note is that compaction work overall is reduced. The reason for
this is that when free pageblocks are more readily available, allocations
are also much more likely to get physically placed in LRU order, instead
of being forced to scavenge free space here and there. This means that
reclaim by itself has better chances of freeing up whole blocks, and the
system relies less on compaction.
Comparing all changes to the vanilla kernel:

                                  VANILLA              DEFRAGMODE-ASYNC-WMARKS
Hugealloc Time mean               52739.45 (  +0.00%)   28904.00 ( -45.19%)
Hugealloc Time stddev             56541.26 (  +0.00%)   33464.37 ( -40.81%)
Kbuild Real time                    197.47 (  +0.00%)     196.59 (  -0.44%)
Kbuild User time                   1240.49 (  +0.00%)    1231.67 (  -0.71%)
Kbuild System time                   70.08 (  +0.00%)      59.10 ( -15.45%)
THP fault alloc                   46727.07 (  +0.00%)   63223.67 ( +35.30%)
THP fault fallback                21910.60 (  +0.00%)    5412.47 ( -75.29%)
Direct compact fail                 195.80 (  +0.00%)      59.07 ( -69.48%)
Direct compact success                7.93 (  +0.00%)       2.80 ( -57.46%)
Direct compact success rate %         3.51 (  +0.00%)       3.99 ( +10.49%)
Compact daemon scanned migrate  3369601.27 (  +0.00%) 2267500.33 ( -32.71%)
Compact daemon scanned free     5075474.47 (  +0.00%) 2339773.00 ( -53.90%)
Compact direct scanned migrate   161787.27 (  +0.00%)   47659.93 ( -70.54%)
Compact direct scanned free      163467.53 (  +0.00%)   40729.67 ( -75.08%)
Compact total migrate scanned   3531388.53 (  +0.00%) 2315160.27 ( -34.44%)
Compact total free scanned      5238942.00 (  +0.00%) 2380502.67 ( -54.56%)
Alloc stall                        2371.07 (  +0.00%)     638.87 ( -73.02%)
Pages kswapd scanned            2160926.73 (  +0.00%) 4002186.33 ( +85.21%)
Pages kswapd reclaimed           533191.07 (  +0.00%)  718577.80 ( +34.77%)
Pages direct scanned             400450.33 (  +0.00%)  355172.73 ( -11.31%)
Pages direct reclaimed            94441.73 (  +0.00%)   31162.80 ( -67.00%)
Pages total scanned             2561377.07 (  +0.00%) 4357359.07 ( +70.12%)
Pages total reclaimed            627632.80 (  +0.00%)  749740.60 ( +19.46%)
Swap out                          47959.53 (  +0.00%)  110084.33 (+129.53%)
Swap in                            7276.00 (  +0.00%)   24457.00 (+236.10%)
File refaults                    138043.00 (  +0.00%)  188226.93 ( +36.35%)

THP allocation latencies and %sys time are down dramatically. THP
allocation failures are down from nearly 50% to 8.5%. And to recall
previous data points, the success rates are steady and reliable without
the cumulative deterioration of fragmentation events.

Compaction work is down overall. Direct compaction work especially is
drastically reduced. As an aside, its success rate of 4% indicates there
is room for improvement. For now it's good to rely on it less.

Reclaim work is up overall, however direct reclaim work is down. Part of
the increase can be attributed to a higher use of THPs, which due to
internal fragmentation increase the memory footprint. This is not
necessarily an unexpected side-effect for users of THP.

However, taken both points together, there may well be some opportunities
for fine tuning in the reclaim/compaction coordination.
[hannes@cmpxchg.org: fix squawks from rebasing]
  Link: https://lkml.kernel.org/r/20250314210558.GD1316033@cmpxchg.org
Link: https://lkml.kernel.org/r/20250313210647.1314586-6-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner
Cc: Mel Gorman
Cc: Vlastimil Babka
Cc: Zi Yan
Signed-off-by: Andrew Morton
---
 include/linux/mmzone.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index dbb0ad69e17f..37c29f3fbca8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -138,6 +138,7 @@ enum numa_stat_item {
 enum zone_stat_item {
 	/* First 128 byte cacheline (assuming 64 bit words) */
 	NR_FREE_PAGES,
+	NR_FREE_PAGES_BLOCKS,
 	NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
 	NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
 	NR_ZONE_ACTIVE_ANON,
--
cgit v1.2.3


From 79f88a584e35133359c394506b351a60230cf37b Mon Sep 17 00:00:00 2001
From: Maxime Chevallier
Date: Fri, 7 Mar 2025 18:35:58 +0100
Subject: net: ethtool: Export the link_mode_params definitions

link_mode_params contains a lookup table of all 802.3 link modes that are
currently supported with structured data about each mode's speed, duplex,
number of lanes and mediums.

As a preparation for a port representation, export that table for the
rest of the net stack to use.

Signed-off-by: Maxime Chevallier
Link: https://patch.msgid.link/20250307173611.129125-2-maxime.chevallier@bootlin.com
Signed-off-by: Paolo Abeni
---
 include/linux/ethtool.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 7f222dccc7d1..8210ece94fa6 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -210,6 +210,14 @@ static inline u8 *ethtool_rxfh_context_key(struct ethtool_rxfh_context *ctx)

 void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id);

+struct link_mode_info {
+	int				speed;
+	u8				lanes;
+	u8				duplex;
+};
+
+extern const struct link_mode_info link_mode_params[];
+
 /* declare a link mode bitmap */
 #define __ETHTOOL_DECLARE_LINK_MODE_MASK(name)		\
 	DECLARE_BITMAP(name, __ETHTOOL_LINK_MODE_MASK_NBITS)
--
cgit v1.2.3


From 8c8c4a87933dd924f9fb56dfd35bae7e8f30a4b5 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier
Date: Fri, 7 Mar 2025 18:36:00 +0100
Subject: net: phy: phy_caps: Move phy_speeds to phy_caps

Use the newly introduced link_capabilities array to derive the list of
possible speeds when given a combination of linkmodes. As
link_capabilities is indexed by speed, we don't have to iterate the whole
phy_settings array.
Signed-off-by: Maxime Chevallier
Link: https://patch.msgid.link/20250307173611.129125-4-maxime.chevallier@bootlin.com
Signed-off-by: Paolo Abeni
---
 include/linux/phy.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 61a8cb9d1247..83c50bb21939 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1287,8 +1287,6 @@ struct phy_setting {
 const struct phy_setting *
 phy_lookup_setting(int speed, int duplex, const unsigned long *mask,
 		   bool exact);
-size_t phy_speeds(unsigned int *speeds, size_t size,
-		  unsigned long *mask);

 /**
  * phy_is_started - Convenience function to check whether PHY is started
--
cgit v1.2.3


From ce60fef7feccb5c71d5b49e489d24db7d79c2ac7 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier
Date: Fri, 7 Mar 2025 18:36:07 +0100
Subject: net: phy: drop phy_settings and the associated lookup helpers

The phy_settings array is no longer relevant as it has now been replaced
by the link_caps array and associated phy_caps helpers.

Signed-off-by: Maxime Chevallier
Link: https://patch.msgid.link/20250307173611.129125-11-maxime.chevallier@bootlin.com
Signed-off-by: Paolo Abeni
---
 include/linux/phy.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 83c50bb21939..c24e1a565819 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1275,19 +1275,6 @@ const char *phy_rate_matching_to_str(int rate_matching);

 int phy_interface_num_ports(phy_interface_t interface);

-/* A structure for mapping a particular speed and duplex
- * combination to a particular SUPPORTED and ADVERTISED value
- */
-struct phy_setting {
-	u32 speed;
-	u8 duplex;
-	u8 bit;
-};
-
-const struct phy_setting *
-phy_lookup_setting(int speed, int duplex, const unsigned long *mask,
-		   bool exact);
-
 /**
  * phy_is_started - Convenience function to check whether PHY is started
  * @phydev: The phy_device struct
--
cgit v1.2.3


From f2aac4c73c9945cce156fd58a9a2f31f2c8a90c7 Mon Sep 17 00:00:00 2001
From: Niklas Cassel
Date: Mon, 17 Mar 2025 18:03:49 +0100
Subject: ata: libata-core: Add ATA_QUIRK_NO_LPM_ON_ATI for certain Samsung SSDs

Before commit 7627a0edef54 ("ata: ahci: Drop low power policy board
type") the ATI AHCI controllers specified board type 'board_ahci' rather
than board type 'board_ahci_low_power'. This means that LPM was
historically not enabled for the ATI AHCI controllers.

By looking at commit 7a8526a5cd51 ("libata: Add ATA_HORKAGE_NO_NCQ_ON_ATI
for Samsung 860 and 870 SSD."), it is clear that, for some unknown
reason, Samsung SSDs do not play nice with ATI AHCI controllers. (When
using other AHCI controllers, NCQ can be enabled on these Samsung SSDs
without issues.)

In a similar way, from user reports, it is clear the ATI AHCI controllers
can enable LPM on e.g. Maxtor HDDs perfectly fine, but when enabling LPM
on certain Samsung SSDs, things break. (E.g. the SSDs will not get
detected by the ATI AHCI controller even after a COMRESET.) Yet, when
using LPM on these Samsung SSDs with other AHCI controllers, e.g. Intel
AHCI controllers, these Samsung drives appear to work perfectly fine.

Considering that the combination of ATI + Samsung, for some unknown
reason, does not seem to work well, disable LPM when detecting an ATI
AHCI controller with a problematic Samsung SSD.

Apply this new ATA_QUIRK_NO_LPM_ON_ATI quirk for all Samsung SSDs that
have already been reported to not play nice with ATI
(ATA_QUIRK_NO_NCQ_ON_ATI).
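[ editor's note: a hypothetical sketch of how such a quirk-table entry
  might combine the two flags; the actual table lives in
  drivers/ata/libata-core.c and the model glob shown is illustrative: ]

	/* Samsung SSDs known to misbehave behind ATI AHCI controllers */
	{ "Samsung SSD 870*",	NULL,	ATA_QUIRK_NO_NCQ_ON_ATI |
					ATA_QUIRK_NO_LPM_ON_ATI },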
Fixes: 7627a0edef54 ("ata: ahci: Drop low power policy board type")
Suggested-by: Hans de Goede
Reported-by: Eric
Closes: https://lore.kernel.org/linux-ide/Z8SBZMBjvVXA7OAK@eldamar.lan/
Tested-by: Eric
Reviewed-by: Damien Le Moal
Link: https://lore.kernel.org/r/20250317170348.1748671-2-cassel@kernel.org
Signed-off-by: Niklas Cassel
---
 include/linux/libata.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index c1a85d46eba6..4f62c43059c2 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -88,6 +88,7 @@ enum ata_quirks {
 	__ATA_QUIRK_MAX_SEC_1024,	/* Limit max sects to 1024 */
 	__ATA_QUIRK_MAX_TRIM_128M,	/* Limit max trim size to 128M */
 	__ATA_QUIRK_NO_NCQ_ON_ATI,	/* Disable NCQ on ATI chipset */
+	__ATA_QUIRK_NO_LPM_ON_ATI,	/* Disable LPM on ATI chipset */
 	__ATA_QUIRK_NO_ID_DEV_LOG,	/* Identify device log missing */
 	__ATA_QUIRK_NO_LOG_DIR,		/* Do not read log directory */
 	__ATA_QUIRK_NO_FUA,		/* Do not use FUA */
@@ -432,6 +433,7 @@ enum {
 	ATA_QUIRK_MAX_SEC_1024		= (1U << __ATA_QUIRK_MAX_SEC_1024),
 	ATA_QUIRK_MAX_TRIM_128M		= (1U << __ATA_QUIRK_MAX_TRIM_128M),
 	ATA_QUIRK_NO_NCQ_ON_ATI		= (1U << __ATA_QUIRK_NO_NCQ_ON_ATI),
+	ATA_QUIRK_NO_LPM_ON_ATI		= (1U << __ATA_QUIRK_NO_LPM_ON_ATI),
 	ATA_QUIRK_NO_ID_DEV_LOG		= (1U << __ATA_QUIRK_NO_ID_DEV_LOG),
 	ATA_QUIRK_NO_LOG_DIR		= (1U << __ATA_QUIRK_NO_LOG_DIR),
 	ATA_QUIRK_NO_FUA		= (1U << __ATA_QUIRK_NO_FUA),
--
cgit v1.2.3


From d375db42a8effdfeb7973d4f1b0b5985e066fd28 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad
Date: Thu, 13 Mar 2025 16:18:41 +0200
Subject: RDMA/mlx5: Add optional counters for RDMA_TX/RX_packets/bytes

Add the following optional counters: rdma_tx_packets, rdma_rx_bytes,
rdma_rx_packets and rdma_tx_bytes, which count all RDMA packets/bytes
sent and received per link.

Note that since each direction's packet and byte counters are shared, a
counter is only reset when both counters of that direction are removed.
But from the user's perspective each can be enabled/disabled separately.
The counters can be enabled using:
  sudo rdma stat set link rocep8s0f0/1 optional-counters rdma_tx_packets

And can be seen using:
  rdma stat -j show link rocep8s0f0/1

Signed-off-by: Patrisious Haddad
Reviewed-by: Mark Bloch
Link: https://patch.msgid.link/9f2753ad636f21704416df64b47395c8991d1123.1741875070.git.leon@kernel.org
Signed-off-by: Leon Romanovsky
---
 include/linux/mlx5/device.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 8fe56d0362c6..63f0d9fb94b4 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1532,8 +1532,8 @@ static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz)
 	return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz;
 }

-#define MLX5_RDMA_RX_NUM_COUNTERS_PRIOS 2
-#define MLX5_RDMA_TX_NUM_COUNTERS_PRIOS 1
+#define MLX5_RDMA_RX_NUM_COUNTERS_PRIOS 3
+#define MLX5_RDMA_TX_NUM_COUNTERS_PRIOS 2
 #define MLX5_BY_PASS_NUM_REGULAR_PRIOS 16
 #define MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS 16
 #define MLX5_BY_PASS_NUM_MULTICAST_PRIOS 1
--
cgit v1.2.3


From fd24c9ef6c8f12fd045524c7cae1992f7967e94b Mon Sep 17 00:00:00 2001
From: Patrisious Haddad
Date: Thu, 13 Mar 2025 16:18:46 +0200
Subject: RDMA/mlx5: Support optional-counters binding for QPs

Add support for binding optional-counters to a QP: when a bind operation
is requested, the driver determines from the counter's optional-counter
binding state whether to also add optional-counters to this QP binding.

The optional-counter binding is done by simply adding a steering rule for
the specific optional-counter condition with the additional match over
that QP number.

Note that optional-counters per QP rules are handled on an earlier prio
than per device counters, and per device counter correctness is
maintained by core, which is responsible for summing active counters when
checking a device counter and for adding them to the history count when
they are deallocated.

Signed-off-by: Patrisious Haddad
Reviewed-by: Mark Bloch
Link: https://patch.msgid.link/2cad1b891a6641ae61fe8d92f867e1059121813a.1741875070.git.leon@kernel.org
Signed-off-by: Leon Romanovsky
---
 include/linux/mlx5/device.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 63f0d9fb94b4..344124644697 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1532,8 +1532,8 @@ static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz)
 	return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz;
 }

-#define MLX5_RDMA_RX_NUM_COUNTERS_PRIOS 3
-#define MLX5_RDMA_TX_NUM_COUNTERS_PRIOS 2
+#define MLX5_RDMA_RX_NUM_COUNTERS_PRIOS 6
+#define MLX5_RDMA_TX_NUM_COUNTERS_PRIOS 4
 #define MLX5_BY_PASS_NUM_REGULAR_PRIOS 16
 #define MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS 16
 #define MLX5_BY_PASS_NUM_MULTICAST_PRIOS 1
--
cgit v1.2.3


From 8d4880db378350f8ed8969feea13bdc164564fc1 Mon Sep 17 00:00:00 2001
From: Paolo Abeni
Date: Tue, 11 Mar 2025 21:42:28 +0100
Subject: udp_tunnel: create a fastpath GRO lookup.

Most UDP tunnels bind a socket to a local port, with ANY address, no
peer and no interface index specified. Additionally it's quite common to
have a single tunnel device per namespace.

Track in each namespace the UDP tunnel socket respecting the above. When
only a single one is present, store a reference in the netns.

When such reference is not NULL, UDP tunnel GRO lookup just needs to
match the incoming packet destination port vs the socket local port.
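[ editor's note: a minimal sketch of the fastpath match described above,
  assuming a single tunnel socket is present; the helper name
  udp_tunnel_gro_lookup() is illustrative, only udp_tunnel_sk() comes
  from this patch: ]

	static struct sock *udp_tunnel_gro_lookup(const struct net *net,
						  __be16 dport, bool is_ipv6)
	{
		struct sock *sk = udp_tunnel_sk(net, is_ipv6);

		/* single candidate: matching the local port is enough */
		if (sk && dport == htons(inet_sk(sk)->inet_num))
			return sk;
		return NULL;
	}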
The tunnel socket never sets the reuse[port] flag[s]. When bound to no
address and interface, no other socket can exist in the same netns
matching the specified local port.

Matching packets with non-local destination addresses will be
aggregated, and eventually segmented as needed - no behavior changes
intended.

Note that the UDP tunnel socket reference is stored into struct
netns_ipv4 for both IPv4 and IPv6 tunnels. That is intentional to keep
all the fastpath-related netns fields in the same struct and allow
cacheline-based optimization. Currently both the IPv4 and IPv6 socket
pointer share the same cacheline as the `udp_table` field.

Reviewed-by: Willem de Bruijn
Link: https://patch.msgid.link/4d5c319c4471161829f50cb8436841de81a5edae.1741718157.git.pabeni@redhat.com
Signed-off-by: Paolo Abeni
---
 include/linux/udp.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 0807e21cfec9..895240177f4f 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -101,6 +101,13 @@ struct udp_sock {

 	/* Cache friendly copy of sk->sk_peek_off >= 0 */
 	bool		peeking_with_offset;
+
+	/*
+	 * Accounting for the tunnel GRO fastpath.
+	 * Unprotected by compilers guard, as it uses space available in
+	 * the last UDP socket cacheline.
+	 */
+	struct hlist_node	tunnel_list;
 };

 #define udp_test_bit(nr, sk)			\
@@ -219,4 +226,13 @@ static inline void udp_allow_gso(struct sock *sk)

 #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE)

+static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6)
+{
+#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
+	return rcu_dereference(net->ipv4.udp_tunnel_gro[is_ipv6].sk);
+#else
+	return NULL;
+#endif
+}
+
 #endif	/* _LINUX_UDP_H */
--
cgit v1.2.3


From 24faa63bcea88b6f24b0a3a710708505a876f9ba Mon Sep 17 00:00:00 2001
From: Yue Haibing
Date: Wed, 12 Mar 2025 14:34:50 +0800
Subject: net: skbuff: Remove unused skb_add_data()

Since commit a4ea4c477619 ("rxrpc: Don't use a ring buffer for call Tx
queue") this function is not used anymore.

Signed-off-by: Yue Haibing
Reviewed-by: Simon Horman
Link: https://patch.msgid.link/20250312063450.183652-1-yuehaibing@huawei.com
Signed-off-by: Paolo Abeni
---
 include/linux/skbuff.h | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b8a1343d6785..cd8294cdc249 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3867,25 +3867,6 @@ static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int l
 bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
 				  __wsum *csum, struct iov_iter *i) __must_check;

-static inline int skb_add_data(struct sk_buff *skb,
-			       struct iov_iter *from, int copy)
-{
-	const int off = skb->len;
-
-	if (skb->ip_summed == CHECKSUM_NONE) {
-		__wsum csum = 0;
-		if (csum_and_copy_from_iter_full(skb_put(skb, copy), copy,
-						 &csum, from)) {
-			skb->csum = csum_block_add(skb->csum, csum, off);
-			return 0;
-		}
-	} else if (copy_from_iter_full(skb_put(skb, copy), copy, from))
-		return 0;
-
-	__skb_trim(skb, off);
-	return -EFAULT;
-}
-
 static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
 				    const struct page *page, int off)
 {
--
cgit v1.2.3


From 611851010c74046c0bc2b0461b72a6fae81c16d0 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Thu, 13 Mar 2025 15:27:44 +0100
Subject: fs: dedup handling of struct filename init and refcounts bumps

No functional changes.
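[ editor's note: a sketch of the intended use of the refname() helper
  added below, replacing open-coded refcount bumps; the caller shown is
  hypothetical: ]

	struct filename *extra = refname(name);	/* take an extra reference */

	/* ... hand "extra" to another consumer ... */

	putname(extra);		/* each reference is put exactly once */
	putname(name);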
Signed-off-by: Mateusz Guzik
Link: https://lore.kernel.org/r/20250313142744.1323281-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner
---
 include/linux/fs.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ab789cd1531..ae1fce54eb60 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2859,6 +2859,12 @@ static inline struct filename *getname_maybe_null(const char __user *name, int f
 }
 extern void putname(struct filename *name);

+static inline struct filename *refname(struct filename *name)
+{
+	atomic_inc(&name->refcnt);
+	return name;
+}
+
 extern int finish_open(struct file *file, struct dentry *dentry,
 			int (*open)(struct inode *, struct file *));
 extern int finish_no_open(struct file *file, struct dentry *dentry);
--
cgit v1.2.3


From b28f47ac3ddd072cce0e447ace89e2b4d6932c66 Mon Sep 17 00:00:00 2001
From: Miquel Raynal
Date: Tue, 4 Mar 2025 12:05:39 +0100
Subject: mtd: spinand: Improve spinand_info macros style

Let's assume all these macros should not have a trailing comma, this way
the caller can use a more formal and usual C writing style, as reflected
in the Macronix driver.

Acked-by: Pratyush Yadav
Signed-off-by: Miquel Raynal
---
 include/linux/mtd/spinand.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 5837a09ab9d8..1e748958dad4 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -502,10 +502,10 @@ struct spinand_info {
 	}

 #define SPINAND_SELECT_TARGET(__func)					\
-	.select_target = __func,
+	.select_target = __func

 #define SPINAND_CONT_READ(__set_cont_read)				\
-	.set_cont_read = __set_cont_read,
+	.set_cont_read = __set_cont_read

 #define SPINAND_FACT_OTP_INFO(__npages, __start_page, __ops)		\
 	.fact_otp = {							\
@@ -526,8 +526,8 @@ struct spinand_info {
 	}

 #define SPINAND_READ_RETRY(__read_retries, __set_read_retry)		\
-	.read_retries = __read_retries,					\
-	.set_read_retry = __set_read_retry,
+	.read_retries = __read_retries,					\
+	.set_read_retry = __set_read_retry

 #define SPINAND_INFO(__model, __id, __memorg, __eccreq, __op_variants,	\
 		     __flags, ...)					\
--
cgit v1.2.3


From ca8cbbb2be8f906d9602a6e4324f8adf279e9cc2 Mon Sep 17 00:00:00 2001
From: Miquel Raynal
Date: Wed, 5 Mar 2025 20:49:55 +0100
Subject: mtd: nand: Fix a kdoc comment

The max_bad_eraseblocks_per_lun member of nand_device obviously describes
the *maximum* number of bad eraseblocks per LUN. Fix this obvious typo.
Fixes: 377e517b5fa5 ("mtd: nand: Add max_bad_eraseblocks_per_lun info to memorg")
Cc:  # fix kdoc comment
Reviewed-by: Tudor Ambarus
Signed-off-by: Miquel Raynal
---
 include/linux/mtd/nand.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 0e2f228e8b4a..07486168d104 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -21,7 +21,7 @@ struct nand_device;
  * @oobsize: OOB area size
  * @pages_per_eraseblock: number of pages per eraseblock
  * @eraseblocks_per_lun: number of eraseblocks per LUN (Logical Unit Number)
- * @max_bad_eraseblocks_per_lun: maximum number of eraseblocks per LUN
+ * @max_bad_eraseblocks_per_lun: maximum number of bad eraseblocks per LUN
  * @planes_per_lun: number of planes per LUN
  * @luns_per_target: number of LUN per target (target is a synonym for die)
  * @ntargets: total number of targets exposed by the NAND device
--
cgit v1.2.3


From db5e8ea155fc1d89c87cb81f0e4a681a77b9b03f Mon Sep 17 00:00:00 2001
From: Jan Glaza
Date: Tue, 4 Mar 2025 12:08:31 +0100
Subject: virtchnl: make proto and filter action count unsigned

The count field in virtchnl_proto_hdrs and virtchnl_filter_action_set
should never be negative while still being valid. Changing it from int
to u32 ensures proper handling of values in virtchnl messages in drivers
and prevents unintended behavior.

In its current signed form, a negative count does not trigger an error
in the ice driver but instead results in it being treated as 0. This can
lead to unexpected outcomes when processing messages. By using u32, any
invalid values will correctly trigger -EINVAL, making error detection
more robust.

Fixes: 1f7ea1cd6a374 ("ice: Enable FDIR Configure for AVF")
Reviewed-by: Jedrzej Jagielski
Reviewed-by: Simon Horman
Signed-off-by: Jan Glaza
Signed-off-by: Martyna Szapar-Mudlaw
Tested-by: Rafal Romanowski
Signed-off-by: Tony Nguyen
---
 include/linux/avf/virtchnl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 13a11f3c09b8..aca06f300f83 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -1283,7 +1283,7 @@ struct virtchnl_proto_hdrs {
 	 * 2 - from the second inner layer
 	 * ....
 	 **/
-	int count;	/* the proto layers must < VIRTCHNL_MAX_NUM_PROTO_HDRS */
+	u32 count;	/* the proto layers must < VIRTCHNL_MAX_NUM_PROTO_HDRS */
 	union {
 		struct virtchnl_proto_hdr
 			proto_hdr[VIRTCHNL_MAX_NUM_PROTO_HDRS];
@@ -1335,7 +1335,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(36, virtchnl_filter_action);

 struct virtchnl_filter_action_set {
 	/* action number must be less then VIRTCHNL_MAX_NUM_ACTIONS */
-	int count;
+	u32 count;
 	struct virtchnl_filter_action actions[VIRTCHNL_MAX_NUM_ACTIONS];
 };
--
cgit v1.2.3


From e36ba5ab808ef6237c3148d469c8238674230e2b Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Tue, 11 Mar 2025 12:44:23 -0700
Subject: iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC

Introduce a new IOMMUFD_OBJ_VEVENTQ object for vIOMMU Event Queue that
provides user space (VMM) another FD to read the vIOMMU Events.

Allow a vIOMMU object to allocate vEVENTQs, with a condition that each
vIOMMU can only have one single vEVENTQ per type.

Add iommufd_veventq_alloc() with iommufd_veventq_ops for the new ioctl.
Link: https://patch.msgid.link/r/21acf0751dd5c93846935ee06f93b9c65eff5e04.1741719725.git.nicolinc@nvidia.com
Reviewed-by: Lu Baolu
Reviewed-by: Kevin Tian
Reviewed-by: Jason Gunthorpe
Signed-off-by: Nicolin Chen
Signed-off-by: Jason Gunthorpe
---
 include/linux/iommufd.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 11110c749200..8948b1836940 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -34,6 +34,7 @@ enum iommufd_object_type {
 	IOMMUFD_OBJ_FAULT,
 	IOMMUFD_OBJ_VIOMMU,
 	IOMMUFD_OBJ_VDEVICE,
+	IOMMUFD_OBJ_VEVENTQ,
 #ifdef CONFIG_IOMMUFD_TEST
 	IOMMUFD_OBJ_SELFTEST,
 #endif
@@ -93,6 +94,8 @@ struct iommufd_viommu {
 	const struct iommufd_viommu_ops *ops;

 	struct xarray vdevs;
+	struct list_head veventqs;
+	struct rw_semaphore veventqs_rwsem;

 	unsigned int type;
 };
--
cgit v1.2.3


From ea94b211c5483080b749c142090f4c4de4926e51 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Tue, 11 Mar 2025 12:44:24 -0700
Subject: iommufd/viommu: Add iommufd_viommu_get_vdev_id helper

This is a reverse search v.s. iommufd_viommu_find_dev, as drivers may
want to convert a struct device pointer (physical) to its virtual device
ID for an event injection to the user space VM.

Again, this avoids exposing more core structures to the drivers than the
iommufd_viommu alone.

Link: https://patch.msgid.link/r/18b8e8bc1b8104d43b205d21602c036fd0804e56.1741719725.git.nicolinc@nvidia.com
Reviewed-by: Lu Baolu
Reviewed-by: Kevin Tian
Reviewed-by: Jason Gunthorpe
Signed-off-by: Nicolin Chen
Signed-off-by: Jason Gunthorpe
---
 include/linux/iommufd.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 8948b1836940..05cb393aff0a 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -190,6 +190,8 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
 					     enum iommufd_object_type type);
 struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
 				       unsigned long vdev_id);
+int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu,
+			       struct device *dev, unsigned long *vdev_id);
 #else /* !CONFIG_IOMMUFD_DRIVER_CORE */
 static inline struct iommufd_object *
 _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size,
@@ -203,6 +205,13 @@ iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id)
 {
 	return NULL;
 }
+
+static inline int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu,
+					     struct device *dev,
+					     unsigned long *vdev_id)
+{
+	return -ENOENT;
+}
 #endif /* CONFIG_IOMMUFD_DRIVER_CORE */

 /*
--
cgit v1.2.3


From e8e1ef9b77a7a09b7809890a52229f24d3c8b532 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Tue, 11 Mar 2025 12:44:25 -0700
Subject: iommufd/viommu: Add iommufd_viommu_report_event helper

Similar to iommu_report_device_fault, this allows IOMMU drivers to
report vIOMMU events from threaded IRQ handlers to user space
hypervisors.
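[ editor's note: a hypothetical driver-side sketch combining the two
  helpers from this series; the event type constant and "evt" payload
  are illustrative, not taken from this patch: ]

	/* in a threaded IRQ handler */
	unsigned long vdev_id;

	if (!iommufd_viommu_get_vdev_id(viommu, dev, &vdev_id))
		iommufd_viommu_report_event(viommu,
					    IOMMU_VEVENTQ_TYPE_ARM_SMMUV3,
					    &evt, sizeof(evt));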
Link: https://patch.msgid.link/r/44be825042c8255e75d0151b338ffd8ba0e4920b.1741719725.git.nicolinc@nvidia.com
Reviewed-by: Lu Baolu
Reviewed-by: Kevin Tian
Reviewed-by: Jason Gunthorpe
Signed-off-by: Nicolin Chen
Signed-off-by: Jason Gunthorpe
---
 include/linux/iommufd.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 05cb393aff0a..60eff9272551 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include

 struct device;
 struct file;
@@ -192,6 +193,9 @@ struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
 				       unsigned long vdev_id);
 int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu,
 			       struct device *dev, unsigned long *vdev_id);
+int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
+				enum iommu_veventq_type type, void *event_data,
+				size_t data_len);
 #else /* !CONFIG_IOMMUFD_DRIVER_CORE */
 static inline struct iommufd_object *
 _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size,
@@ -212,6 +216,13 @@ static inline int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu,
 {
 	return -ENOENT;
 }
+
+static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
+					      enum iommu_veventq_type type,
+					      void *event_data, size_t data_len)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_IOMMUFD_DRIVER_CORE */

 /*
--
cgit v1.2.3


From ae0a457f5d33c336f3c4259a258f8b537531a04b Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis
Date: Mon, 17 Mar 2025 23:07:53 -0400
Subject: bpf: Make perf_event_read_output accessible in all program types.

The perf_event_read_event_output helper is currently only available to
tracing programs, but is useful for other BPF programs like sched_ext
schedulers. When the helper is available, provide its bpf_func_proto
directly from the bpf base_proto.

Signed-off-by: Emil Tsalapatis (Meta)
Acked-by: Jiri Olsa
Link: https://lore.kernel.org/r/20250318030753.10949-1-emil@etsalapatis.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0d7b70124d81..973a88d9b52b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2059,6 +2059,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp);
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void);

+const struct bpf_func_proto *bpf_get_perf_event_read_value_proto(void);
+
 typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src,
 					unsigned long off, unsigned long len);
 typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type,
--
cgit v1.2.3


From e794dc30be8b69e44646a162db09b43f719525b7 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Thu, 13 Feb 2025 16:07:15 +0200
Subject: i2c: Introduce i2c_10bit_addr_*_from_msg() helpers

There are already a lot of drivers that have been using
i2c_8bit_addr_from_msg() for 7-bit addresses, now it's time to have
similar helpers for 10-bit addresses.
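[ editor's note: a sketch of how a controller driver might use the new
  helpers when emitting the address phase; i2c_send_byte() is a
  hypothetical bus-specific primitive: ]

	if (msg->flags & I2C_M_TEN) {
		/* 10-bit: 5'b11110 | addr[9:8] | R/nW, then addr[7:0] */
		i2c_send_byte(dev, i2c_10bit_addr_hi_from_msg(msg));
		i2c_send_byte(dev, i2c_10bit_addr_lo_from_msg(msg));
	} else {
		i2c_send_byte(dev, i2c_8bit_addr_from_msg(msg));
	}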
Signed-off-by: Andy Shevchenko
Reviewed-by: AngeloGioacchino Del Regno
Link: https://lore.kernel.org/r/20250213141045.2716943-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Andi Shyti
---
 include/linux/i2c.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 2b2af24d2a43..1e35b1e23165 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -952,6 +952,21 @@ static inline u8 i2c_8bit_addr_from_msg(const struct i2c_msg *msg)
 	return (msg->addr << 1) | (msg->flags & I2C_M_RD);
 }

+/*
+ * 10-bit address
+ * addr_1: 5'b11110 | addr[9:8] | (R/nW)
+ * addr_2: addr[7:0]
+ */
+static inline u8 i2c_10bit_addr_hi_from_msg(const struct i2c_msg *msg)
+{
+	return 0xf0 | ((msg->addr & GENMASK(9, 8)) >> 7) | (msg->flags & I2C_M_RD);
+}
+
+static inline u8 i2c_10bit_addr_lo_from_msg(const struct i2c_msg *msg)
+{
+	return msg->addr & GENMASK(7, 0);
+}
+
 u8 *i2c_get_dma_safe_msg_buf(struct i2c_msg *msg, unsigned int threshold);
 void i2c_put_dma_safe_msg_buf(u8 *buf, struct i2c_msg *msg, bool xferred);
--
cgit v1.2.3


From 00d7fc04b703eb3e9d61dd3eac02a34c466e9f12 Mon Sep 17 00:00:00 2001
From: Pawan Gupta
Date: Tue, 11 Mar 2025 08:02:36 -0700
Subject: x86/cpu: Add cpu_type to struct x86_cpu_id

In addition to matching vendor/family/model/feature, for hybrid variants
it is required to also match cpu-type. For example, some CPU
vulnerabilities like RFDS only affect a specific cpu-type.

To be able to also match CPUs based on their type, add a new field "type"
to struct x86_cpu_id which is used by the CPU-matching tables. Introduce
X86_CPU_TYPE_ANY for the cases that don't care about the cpu-type.

[ bp: Massage commit message. ]

Signed-off-by: Pawan Gupta
Signed-off-by: Borislav Petkov (AMD)
Signed-off-by: Ingo Molnar
Acked-by: Dave Hansen
Link: https://lore.kernel.org/r/20250311-add-cpu-type-v8-3-e8514dcaaff2@linux.intel.com
---
 include/linux/mod_devicetable.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index d67614f7b7f1..bd7e60c0b72f 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -692,6 +692,7 @@ struct x86_cpu_id {
 	__u16 feature;	/* bit index */
 	/* Solely for kernel-internal use: DO NOT EXPORT to userspace! */
 	__u16 flags;
+	__u8 type;
 	kernel_ulong_t driver_data;
 };

@@ -703,6 +704,7 @@ struct x86_cpu_id {
 #define X86_STEP_MIN		0
 #define X86_STEP_MAX		0xf
 #define X86_FEATURE_ANY		0	/* Same as FPU, you can't test for that */
+#define X86_CPU_TYPE_ANY	0

 /*
  * Generic table type for matching CPU features.
--
cgit v1.2.3


From 97eb35f3ad42de1c932ef1f7e2f0044d4fca35f4 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Sat, 15 Mar 2025 21:05:38 -0700
Subject: bpf: Introduce rqspinlock kfuncs

Introduce four new kfuncs, bpf_res_spin_lock, and bpf_res_spin_unlock,
and their irqsave/irqrestore variants, which wrap the rqspinlock APIs.
bpf_res_spin_lock returns a conditional result, depending on whether the
lock was acquired (NULL is returned when lock acquisition succeeds,
non-NULL upon failure). The memory pointed to by the returned pointer
upon failure can be dereferenced after the NULL check to obtain the error
code.

Instead of using the old bpf_spin_lock type, introduce a new type with
the same layout, and the same alignment, but a different name to avoid
type confusion.

Preemption is disabled upon successful lock acquisition, however IRQs
are not.
Special kfuncs can be introduced later to allow disabling IRQs when
taking a spin lock. Resilient locks are safe against AA deadlocks, hence
not disabling IRQs currently does not allow violation of kernel safety.

__irq_flag annotation is used to accept IRQ flags for the IRQ-variants,
with the same semantics as existing bpf_local_irq_{save, restore}.

These kfuncs will require additional verifier-side support in subsequent
commits, to allow programs to hold multiple locks at the same time.

Signed-off-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/20250316040541.108729-23-memxor@gmail.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 973a88d9b52b..e3928fb19c2c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include

 struct bpf_verifier_env;
 struct bpf_verifier_log;
--
cgit v1.2.3


From 0de2046137f976e7302d43ac01d9894d07ac1fff Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Sat, 15 Mar 2025 21:05:39 -0700
Subject: bpf: Implement verifier support for rqspinlock

Introduce verifier-side support for rqspinlock kfuncs. The first step is
allowing bpf_res_spin_lock type to be defined in map values and allocated
objects, so BTF-side is updated with a new BPF_RES_SPIN_LOCK field to
recognize and validate. Any object cannot have both bpf_spin_lock and
bpf_res_spin_lock, only one of them (and at most one of them per-object,
like before) must be present. The bpf_res_spin_lock can also be used to
protect objects that require lock protection for their kfuncs, like BPF
rbtree and linked list.

The verifier plumbing to simulate success and failure cases when calling
the kfuncs is done by pushing a new verifier state to the verifier state
stack which will verify the failure case upon calling the kfunc. The path
where success is indicated creates all lock reference state and IRQ state
(if necessary for irqsave variants). In the case of failure, the state
clears the registers r0-r5, sets the return value, and skips kfunc
processing, proceeding to the next instruction.

When marking the return value for success case, the value is marked as
0, and for the failure case as [-MAX_ERRNO, -1]. Then, in the program,
whenever user checks the return value as 'if (ret)' or 'if (ret < 0)'
the verifier never traverses such branches for success cases, and would
be aware that the lock is not held in such cases.

We push the kfunc state in check_kfunc_call whenever rqspinlock kfuncs
are invoked. We introduce a kfunc_class state to avoid mixing lock
irqrestore kfuncs with IRQ state created by bpf_local_irq_save.

With all this infrastructure, these kfuncs become usable in programs
while satisfying all safety properties required by the kernel.
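[ editor's note: a sketch of what a BPF program using the kfuncs might
  look like once this verifier support lands; the map/value layout and
  kfunc prototypes are assumed for illustration, not taken from this
  patch: ]

	struct val {
		struct bpf_res_spin_lock lock;
		u64 counter;
	};

	struct val *v = bpf_map_lookup_elem(&vals, &key);

	if (!v)
		return 0;
	if (bpf_res_spin_lock(&v->lock))
		return 0;	/* acquisition failed, e.g. deadlock detected */
	v->counter++;
	bpf_res_spin_unlock(&v->lock);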
Acked-by: Eduard Zingerman
Signed-off-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/20250316040541.108729-24-memxor@gmail.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h          |  9 +++++++++
 include/linux/bpf_verifier.h | 16 +++++++++++++---
 2 files changed, 22 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e3928fb19c2c..7a87ecfa2c3f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -205,6 +205,7 @@ enum btf_field_type {
 	BPF_REFCOUNT   = (1 << 9),
 	BPF_WORKQUEUE  = (1 << 10),
 	BPF_UPTR       = (1 << 11),
+	BPF_RES_SPIN_LOCK = (1 << 12),
 };

 typedef void (*btf_dtor_kfunc_t)(void *);
@@ -240,6 +241,7 @@ struct btf_record {
 	u32 cnt;
 	u32 field_mask;
 	int spin_lock_off;
+	int res_spin_lock_off;
 	int timer_off;
 	int wq_off;
 	int refcount_off;
@@ -315,6 +317,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
 	switch (type) {
 	case BPF_SPIN_LOCK:
 		return "bpf_spin_lock";
+	case BPF_RES_SPIN_LOCK:
+		return "bpf_res_spin_lock";
 	case BPF_TIMER:
 		return "bpf_timer";
 	case BPF_WORKQUEUE:
@@ -347,6 +351,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
 	switch (type) {
 	case BPF_SPIN_LOCK:
 		return sizeof(struct bpf_spin_lock);
+	case BPF_RES_SPIN_LOCK:
+		return sizeof(struct bpf_res_spin_lock);
 	case BPF_TIMER:
 		return sizeof(struct bpf_timer);
 	case BPF_WORKQUEUE:
@@ -377,6 +383,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
 	switch (type) {
 	case BPF_SPIN_LOCK:
 		return __alignof__(struct bpf_spin_lock);
+	case BPF_RES_SPIN_LOCK:
+		return __alignof__(struct bpf_res_spin_lock);
 	case BPF_TIMER:
 		return __alignof__(struct bpf_timer);
 	case BPF_WORKQUEUE:
@@ -420,6 +428,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
 	case BPF_RB_ROOT:
 		/* RB_ROOT_CACHED 0-inits, no need to do anything after memset */
 	case BPF_SPIN_LOCK:
+	case BPF_RES_SPIN_LOCK:
 	case BPF_TIMER:
 	case BPF_WORKQUEUE:
 	case BPF_KPTR_UNREF:
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d6cfc4ee6820..bc073a48aed9 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -115,6 +115,14 @@ struct bpf_reg_state {
 			int depth:30;
 		} iter;

+		/* For irq stack slots */
+		struct {
+			enum {
+				IRQ_NATIVE_KFUNC,
+				IRQ_LOCK_KFUNC,
+			} kfunc_class;
+		} irq;
+
 		/* Max size from any of the above. */
 		struct {
 			unsigned long raw1;
@@ -255,9 +263,11 @@ struct bpf_reference_state {
 	 * default to pointer reference on zero initialization of a state.
	 */
 	enum ref_state_type {
-		REF_TYPE_PTR	= 1,
-		REF_TYPE_IRQ	= 2,
-		REF_TYPE_LOCK	= 3,
+		REF_TYPE_PTR		= (1 << 1),
+		REF_TYPE_IRQ		= (1 << 2),
+		REF_TYPE_LOCK		= (1 << 3),
+		REF_TYPE_RES_LOCK	= (1 << 4),
+		REF_TYPE_RES_LOCK_IRQ	= (1 << 5),
 	} type;
 	/* Track each reference created with a unique id, even if the same
	 * instruction creates the reference multiple times (eg, via CALL).
--
cgit v1.2.3


From ea21771c077c7aa85d46dd021d03eb0d96b5f418 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Sat, 15 Mar 2025 21:05:40 -0700
Subject: bpf: Maintain FIFO property for rqspinlock unlock

Since out-of-order unlocks are unsupported for rqspinlock, and irqsave
variants enforce strict FIFO ordering anyway, make the same change for
normal non-irqsave variants, such that FIFO ordering is enforced.

Two new verifier state fields (active_lock_id, active_lock_ptr) are used
to denote the top of the stack, and prev_id and prev_ptr are ascertained
whenever popping the topmost entry through an unlock.
Take special care to make these fields part of the state comparison in
refsafe.

Signed-off-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/20250316040541.108729-25-memxor@gmail.com
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf_verifier.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index bc073a48aed9..9734544b6957 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -268,6 +268,7 @@ struct bpf_reference_state {
 		REF_TYPE_LOCK		= (1 << 3),
 		REF_TYPE_RES_LOCK	= (1 << 4),
 		REF_TYPE_RES_LOCK_IRQ	= (1 << 5),
+		REF_TYPE_LOCK_MASK	= REF_TYPE_LOCK | REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ,
 	} type;
 	/* Track each reference created with a unique id, even if the same
	 * instruction creates the reference multiple times (eg, via CALL).
@@ -434,6 +435,8 @@ struct bpf_verifier_state {
 	u32 active_locks;
 	u32 active_preempt_locks;
 	u32 active_irq_id;
+	u32 active_lock_id;
+	void *active_lock_ptr;
 	bool active_rcu_lock;

 	bool speculative;
--
cgit v1.2.3


From 575e7b0629d4bd485517c40ff20676180476f5f9 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 19 Mar 2025 06:12:47 +0000
Subject: io_uring: rename the data cmd cache

Pick a more descriptive name for the cmd async data cache.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/20250319061251.21452-2-sidong.yang@furiosa.ai
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 0e87e292bfb5..c17d2eedf478 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -318,7 +318,7 @@ struct io_ring_ctx {
 	struct io_alloc_cache	apoll_cache;
 	struct io_alloc_cache	netmsg_cache;
 	struct io_alloc_cache	rw_cache;
-	struct io_alloc_cache	uring_cache;
+	struct io_alloc_cache	cmd_cache;

 	/*
	 * Any cancelable uring_cmd is added to this list in
--
cgit v1.2.3


From 0c1f1eb65425451b8bcde52c055803cddecd1151 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)"
Date: Wed, 12 Mar 2025 09:34:25 +0000
Subject: net: stmmac: allow platforms to use PHY tx clock stop capability

Allow platform glue to instruct stmmac to make use of the PHY transmit
clock stop capability when deciding whether to allow the transmit clock
from the DWMAC core to be stopped.

Reviewed-by: Lad Prabhakar
Signed-off-by: Russell King (Oracle)
Link: https://patch.msgid.link/E1tsITp-005vG9-Px@rmk-PC.armlinux.org.uk
Signed-off-by: Paolo Abeni
---
 include/linux/stmmac.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index b6f03ab12595..c4ec8bb8144e 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -183,7 +183,8 @@ struct dwmac4_addrs {
 #define STMMAC_FLAG_INT_SNAPSHOT_EN		BIT(9)
 #define STMMAC_FLAG_RX_CLK_RUNS_IN_LPI		BIT(10)
 #define STMMAC_FLAG_EN_TX_LPI_CLOCKGATING	BIT(11)
-#define STMMAC_FLAG_HWTSTAMP_CORRECT_LATENCY	BIT(12)
+#define STMMAC_FLAG_EN_TX_LPI_CLK_PHY_CAP	BIT(12)
+#define STMMAC_FLAG_HWTSTAMP_CORRECT_LATENCY	BIT(13)

 struct plat_stmmacenet_data {
 	int bus_id;
--
cgit v1.2.3


From 8033d2aef51722fe74068b52553625ed91ea256c Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev
Date: Wed, 12 Mar 2025 12:05:12 -0700
Subject: Revert "net: replace dev_addr_sem with netdev instance lock"

This reverts commit df43d8bf10316a7c3b1e47e3cc0057a54df4a5b8.
Cc: Kohei Enju
Reviewed-by: Kuniyuki Iwashima
Fixes: df43d8bf1031 ("net: replace dev_addr_sem with netdev instance lock")
Signed-off-by: Stanislav Fomichev
Link: https://patch.msgid.link/20250312190513.1252045-2-sdf@fomichev.me
Tested-by: Lei Yang
Signed-off-by: Paolo Abeni
---
 include/linux/netdevice.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 67527243459b..0db9fc0afe36 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2498,7 +2498,7 @@ struct net_device {
 *
 * Protects:
 * @gro_flush_timeout, @napi_defer_hard_irqs, @napi_list,
- * @net_shaper_hierarchy, @reg_state, @threaded, @dev_addr
+ * @net_shaper_hierarchy, @reg_state, @threaded
 *
 * Partially protects (writers must hold both @lock and rtnl_lock):
 * @up
@@ -4196,6 +4196,10 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 struct netlink_ext_ack *extack);
 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 struct netlink_ext_ack *extack);
+int netif_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
+ struct netlink_ext_ack *extack);
+int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
+ struct netlink_ext_ack *extack);
 int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
 int dev_get_port_parent_id(struct net_device *dev,
 struct netdev_phys_item_id *ppid, bool recurse);
-- cgit v1.2.3


From 6dd132516f8e467f144f7871ff2708ce827417a1 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev
Date: Wed, 12 Mar 2025 12:05:13 -0700
Subject: net: reorder dev_addr_sem lock

Lockdep complains about circular lock in 1 -> 2 -> 3 (see below).

Change the lock ordering to be:
- rtnl_lock
- dev_addr_sem
- netdev_ops (only for lower devices!)
- team_lock (or other per-upper device lock)

1. rtnl_lock -> netdev_ops -> dev_addr_sem

   rtnl_setlink
     rtnl_lock
       do_setlink IFLA_ADDRESS on lower
         netdev_ops
           dev_addr_sem

2. rtnl_lock -> team_lock -> netdev_ops

   rtnl_newlink
     rtnl_lock
       do_setlink IFLA_MASTER on lower
         do_set_master
           team_add_slave
             team_lock
               team_port_add
                 dev_set_mtu
                   netdev_ops

3. rtnl_lock -> dev_addr_sem -> team_lock

   rtnl_newlink
     rtnl_lock
       do_setlink IFLA_ADDRESS on upper
         dev_addr_sem
           netif_set_mac_address
             team_set_mac_address
               team_lock

4. rtnl_lock -> netdev_ops -> dev_addr_sem

   rtnl_lock
     dev_ifsioc
       dev_set_mac_address_user

   __tun_chr_ioctl
     rtnl_lock
       dev_set_mac_address_user

   tap_ioctl
     rtnl_lock
       dev_set_mac_address_user

   dev_set_mac_address_user
     netdev_lock_ops
       netif_set_mac_address_user
         dev_addr_sem

v2:
- move lock reorder to happen after kmalloc (Kuniyuki)

Cc: Kohei Enju
Fixes: df43d8bf1031 ("net: replace dev_addr_sem with netdev instance lock")
Signed-off-by: Stanislav Fomichev
Reviewed-by: Kuniyuki Iwashima
Link: https://patch.msgid.link/20250312190513.1252045-3-sdf@fomichev.me
Tested-by: Lei Yang
Signed-off-by: Paolo Abeni
---
 include/linux/netdevice.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0db9fc0afe36..0c5b1f7f8f3a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4196,8 +4196,6 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 struct netlink_ext_ack *extack);
 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
 struct netlink_ext_ack *extack);
-int netif_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
- struct netlink_ext_ack *extack);
 int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
 struct netlink_ext_ack *extack);
 int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
-- cgit v1.2.3


From 6d6c1ba7824022528dbe3e283fafbd0775424128 Mon Sep 17 00:00:00 2001
From: Uday Shankar
Date: Wed, 12 Mar 2025 13:51:46 -0600
Subject: net, treewide: define and use MAC_ADDR_STR_LEN

There are a few places in the tree which compute the length of the
string representation of a MAC address as 3 * ETH_ALEN - 1. Define a
constant for this and use it where relevant. No functionality changes
are expected.

Signed-off-by: Uday Shankar
Reviewed-by: Michal Swiatkowski
Acked-by: Johannes Berg
Reviewed-by: Breno Leitao
Reviewed-by: Simon Horman
Link: https://patch.msgid.link/20250312-netconsole-v6-1-3437933e79b8@purestorage.com
Signed-off-by: Paolo Abeni
---
 include/linux/if_ether.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h
index 8a9792a6427a..61b7335aa037 100644
--- a/include/linux/if_ether.h
+++ b/include/linux/if_ether.h
@@ -19,6 +19,9 @@
 #include
 #include

+/* XX:XX:XX:XX:XX:XX */
+#define MAC_ADDR_STR_LEN (3 * ETH_ALEN - 1)
+
 static inline struct ethhdr *eth_hdr(const struct sk_buff *skb)
 {
 return (struct ethhdr *)skb_mac_header(skb);
 }
-- cgit v1.2.3


From f8a10bed32f5fbede13a5f22fdc4ab8740ea213a Mon Sep 17 00:00:00 2001
From: Uday Shankar
Date: Wed, 12 Mar 2025 13:51:47 -0600
Subject: netconsole: allow selection of egress interface via MAC address

Currently, netconsole has two methods of configuration - module
parameter and configfs. The former interface allows for netconsole
activation earlier during boot (by specifying the module parameter on
the kernel command line), so it is preferred for debugging issues which
arise before userspace is up/the configfs interface can be used.

The module parameter syntax requires specifying the egress interface
name. This requirement makes it hard to use for a couple reasons:
- The egress interface name can be hard or impossible to predict. For
  example, installing a new network card in a system can change the
  interface names assigned by the kernel.
- When constructing the module parameter, one may have trouble determining the original (kernel-assigned) name of the interface (which is the name that should be given to netconsole) if some stable interface naming scheme is in effect. A human can usually look at kernel logs to determine the original name, but this is very painful if automation is constructing the parameter. For these reasons, allow selection of the egress interface via MAC address when configuring netconsole using the module parameter. Update the netconsole documentation with an example of the new syntax. Selection of egress interface by MAC address via configfs is far less interesting (since when this interface can be used, one should be able to easily convert between MAC address and interface name), so it is left unimplemented. Signed-off-by: Uday Shankar Reviewed-by: Breno Leitao Tested-by: Breno Leitao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250312-netconsole-v6-2-3437933e79b8@purestorage.com Signed-off-by: Paolo Abeni --- include/linux/netpoll.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index f6e8abe0b1f1..0477208ed9ff 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -25,7 +25,13 @@ union inet_addr { struct netpoll { struct net_device *dev; netdevice_tracker dev_tracker; + /* + * Either dev_name or dev_mac can be used to specify the local + * interface - dev_name is used if it is a nonempty string, else + * dev_mac is used. + */ char dev_name[IFNAMSIZ]; + u8 dev_mac[ETH_ALEN]; const char *name; union inet_addr local_ip, remote_ip; -- cgit v1.2.3 From dd5bdaf2b72da81d57f4f99e518af80002b6562e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 17 Mar 2025 11:42:54 +0100 Subject: sched/debug: Make CONFIG_SCHED_DEBUG functionality unconditional All the big Linux distros enable CONFIG_SCHED_DEBUG, because the various features it provides help not just with kernel development, but with system administration and user-space software development as well. Reflect this reality and enable this functionality unconditionally. 
Signed-off-by: Ingo Molnar Tested-by: Shrikanth Hegde Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Valentin Schneider Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250317104257.3496611-4-mingo@kernel.org --- include/linux/energy_model.h | 2 -- include/linux/sched/debug.h | 2 -- include/linux/sched/topology.h | 4 ---- 3 files changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 78318d49276d..65efc0f5ea2e 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -240,9 +240,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, struct em_perf_state *ps; int i; -#ifdef CONFIG_SCHED_DEBUG WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n"); -#endif if (!sum_util) return 0; diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h index b5035afa2396..35ed4577a6cc 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -35,12 +35,10 @@ extern void show_stack(struct task_struct *task, unsigned long *sp, extern void sched_show_task(struct task_struct *p); -#ifdef CONFIG_SCHED_DEBUG struct seq_file; extern void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); -#endif /* Attach to any functions which should be ignored in wchan output. */ #define __sched __section(".sched.text") diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 51f7b8169515..7b4301b7235f 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -25,16 +25,12 @@ enum { }; #undef SD_FLAG -#ifdef CONFIG_SCHED_DEBUG - struct sd_flag_debug { unsigned int meta_flags; char *name; }; extern const struct sd_flag_debug sd_flag_debug[]; -#endif - #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) { -- cgit v1.2.3 From 45456e38c44eda2f1285601398fd289b3cec7002 Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Wed, 12 Mar 2025 21:30:06 +0100 Subject: net: phy: Allow loopback speed selection for PHY drivers PHY drivers support loopback mode, but it is not possible to select the speed of the loopback mode. The speed is chosen by the set_loopback() operation of the PHY driver. Same is valid for genphy_loopback(). There are PHYs that support loopback with different speeds. Extend set_loopback() to make loopback speed selection possible. Signed-off-by: Gerhard Engleder Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250312203010.47429-2-gerhard@engleder-embedded.com Signed-off-by: Paolo Abeni --- include/linux/phy.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index c24e1a565819..1c05158e9438 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1136,8 +1136,16 @@ struct phy_driver { int (*set_tunable)(struct phy_device *dev, struct ethtool_tunable *tuna, const void *data); - /** @set_loopback: Set the loopback mood of the PHY */ - int (*set_loopback)(struct phy_device *dev, bool enable); + /** + * @set_loopback: Set the loopback mode of the PHY + * enable selects if the loopback mode is enabled or disabled. If the + * loopback mode is enabled, then the speed of the loopback mode can be + * requested with the speed argument. If the speed argument is zero, + * then any speed can be selected. 
If the speed argument is > 0, then + * this speed shall be selected for the loopback mode or EOPNOTSUPP + * shall be returned if speed selection is not supported. + */ + int (*set_loopback)(struct phy_device *dev, bool enable, int speed); /** @get_sqi: Get the signal quality indication */ int (*get_sqi)(struct phy_device *dev); /** @get_sqi_max: Get the maximum signal quality indication */ @@ -1915,7 +1923,7 @@ int genphy_read_status(struct phy_device *phydev); int genphy_read_master_slave(struct phy_device *phydev); int genphy_suspend(struct phy_device *phydev); int genphy_resume(struct phy_device *phydev); -int genphy_loopback(struct phy_device *phydev, bool enable); +int genphy_loopback(struct phy_device *phydev, bool enable, int speed); int genphy_soft_reset(struct phy_device *phydev); irqreturn_t genphy_handle_interrupt_no_ack(struct phy_device *phydev); @@ -1957,7 +1965,7 @@ int genphy_c45_pma_baset1_read_master_slave(struct phy_device *phydev); int genphy_c45_read_status(struct phy_device *phydev); int genphy_c45_baset1_read_status(struct phy_device *phydev); int genphy_c45_config_aneg(struct phy_device *phydev); -int genphy_c45_loopback(struct phy_device *phydev, bool enable); +int genphy_c45_loopback(struct phy_device *phydev, bool enable, int speed); int genphy_c45_pma_resume(struct phy_device *phydev); int genphy_c45_pma_suspend(struct phy_device *phydev); int genphy_c45_fast_retrain(struct phy_device *phydev, bool enable); -- cgit v1.2.3 From 0d60fd50328a96a901b09ed653704ce7f41d15ce Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Wed, 12 Mar 2025 21:30:07 +0100 Subject: net: phy: Support speed selection for PHY loopback phy_loopback() leaves it to the PHY driver to select the speed of the loopback mode. Thus, the speed of the loopback mode depends on the PHY driver in use. Add support for speed selection to phy_loopback() to enable loopback with defined speeds. Ensure that link up is signaled if speed changes as speed is not allowed to change during link up. Link down and up is necessary for a new speed. Signed-off-by: Gerhard Engleder Link: https://patch.msgid.link/20250312203010.47429-3-gerhard@engleder-embedded.com Signed-off-by: Paolo Abeni --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 1c05158e9438..60d3b8860ea2 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1808,7 +1808,7 @@ int phy_init_hw(struct phy_device *phydev); int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); int __phy_resume(struct phy_device *phydev); -int phy_loopback(struct phy_device *phydev, bool enable); +int phy_loopback(struct phy_device *phydev, bool enable, int speed); int phy_sfp_connect_phy(void *upstream, struct phy_device *phy); void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy); void phy_sfp_attach(void *upstream, struct sfp_bus *bus); -- cgit v1.2.3 From 5370b43e4bcf60049bfd44db83ba8d2ec43d0152 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 19 Mar 2025 22:58:01 +0100 Subject: fs: reduce work in fdget_pos() 1. predict the file was found 2. explicitly compare the ref to "one", ignoring the dead zone The latter arguably improves the behavior to begin with. Suppose the count turned bad -- the previously used ref routine is going to check for it and return 0, indicating the count does not necessitate taking ->f_pos_lock. But there very well may be several users. i.e. 
not paying for special-casing the dead zone improves semantics. While here spell out each condition in a dedicated if statement. This has no effect on generated code. Sizes are as follows (in bytes; gcc 13, x86-64): stock: 321 likely(): 298 likely()+ref: 280 Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250319215801.1870660-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/file_ref.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/file_ref.h b/include/linux/file_ref.h index 6ef92d765a66..7db62fbc0500 100644 --- a/include/linux/file_ref.h +++ b/include/linux/file_ref.h @@ -208,4 +208,18 @@ static inline unsigned long file_ref_read(file_ref_t *ref) return c >= FILE_REF_RELEASED ? 0 : c + 1; } +/* + * __file_ref_read_raw - Return the value stored in ref->refcnt + * @ref: Pointer to the reference count + * + * Return: The raw value found in the counter + * + * A hack for file_needs_f_pos_lock(), you probably want to use + * file_ref_read() instead. + */ +static inline unsigned long __file_ref_read_raw(file_ref_t *ref) +{ + return atomic_long_read(&ref->refcnt); +} + #endif -- cgit v1.2.3 From 3785c7dbae0f733f13f8857beaaada5d7dc63e02 Mon Sep 17 00:00:00 2001 From: Yujun Dong Date: Mon, 30 Dec 2024 22:16:24 +0800 Subject: cpuidle, sched: Use smp_mb__after_atomic() in current_clr_polling() In architectures that use the polling bit, current_clr_polling() employs smp_mb() to ensure that the clearing of the polling bit is visible to other cores before checking TIF_NEED_RESCHED. However, smp_mb() can be costly. Given that clear_bit() is an atomic operation, replacing smp_mb() with smp_mb__after_atomic() is appropriate. Many architectures implement smp_mb__after_atomic() as a lighter-weight barrier compared to smp_mb(), leading to performance improvements. For instance, on x86, smp_mb__after_atomic() is a no-op. This change eliminates a smp_mb() instruction in the cpuidle wake-up path, saving several CPU cycles and thereby reducing wake-up latency. Architectures that do not use the polling bit will retain the original smp_mb() behavior to ensure that existing dependencies remain unaffected. Signed-off-by: Yujun Dong Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241230141624.155356-1-yujundong@pascal-lab.net --- include/linux/sched/idle.h | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index e670ac282333..439f6029d3b9 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -79,6 +79,21 @@ static __always_inline bool __must_check current_clr_polling_and_test(void) return unlikely(tif_need_resched()); } +static __always_inline void current_clr_polling(void) +{ + __current_clr_polling(); + + /* + * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. + * Once the bit is cleared, we'll get IPIs with every new + * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also + * fold. 
+ */ + smp_mb__after_atomic(); /* paired with resched_curr() */ + + preempt_fold_need_resched(); +} + #else static inline void __current_set_polling(void) { } static inline void __current_clr_polling(void) { } @@ -91,21 +106,15 @@ static inline bool __must_check current_clr_polling_and_test(void) { return unlikely(tif_need_resched()); } -#endif static __always_inline void current_clr_polling(void) { __current_clr_polling(); - /* - * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. - * Once the bit is cleared, we'll get IPIs with every new - * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also - * fold. - */ smp_mb(); /* paired with resched_curr() */ preempt_fold_need_resched(); } +#endif #endif /* _LINUX_SCHED_IDLE_H */ -- cgit v1.2.3 From fd9582998b9a82ef53a34756de8a1fb86abac972 Mon Sep 17 00:00:00 2001 From: Ernest Van Hoecke Date: Wed, 19 Mar 2025 15:20:55 +0100 Subject: of: Add of_property_read_u16_index There is an of_property_read_u32_index and of_property_read_u64_index. This patch adds a similar helper for u16. Signed-off-by: Ernest Van Hoecke Signed-off-by: Francesco Dolcini Reviewed-by: Rob Herring (Arm) Reviewed-by: Charles Keepax Link: https://patch.msgid.link/20250319142059.46692-2-francesco@dolcini.it Signed-off-by: Mark Brown --- include/linux/of.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index eaf0e2a2b75c..5e52d90f6408 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -314,6 +314,9 @@ extern struct property *of_find_property(const struct device_node *np, extern bool of_property_read_bool(const struct device_node *np, const char *propname); extern int of_property_count_elems_of_size(const struct device_node *np, const char *propname, int elem_size); +extern int of_property_read_u16_index(const struct device_node *np, + const char *propname, + u32 index, u16 *out_value); extern int of_property_read_u32_index(const struct device_node *np, const char *propname, u32 index, u32 *out_value); @@ -627,6 +630,12 @@ static inline int of_property_count_elems_of_size(const struct device_node *np, return -ENOSYS; } +static inline int of_property_read_u16_index(const struct device_node *np, + const char *propname, u32 index, u16 *out_value) +{ + return -ENOSYS; +} + static inline int of_property_read_u32_index(const struct device_node *np, const char *propname, u32 index, u32 *out_value) { -- cgit v1.2.3 From 370a6de7651b9745b997c32f90685f9e100ccfcd Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 20 Mar 2025 12:02:50 +0000 Subject: iomap: rework IOMAP atomic flags Flag IOMAP_ATOMIC_SW is not really required. The idea of having this flag is that the FS ->iomap_begin callback could check if this flag is set to decide whether to do a SW (FS-based) atomic write. But the FS can set which ->iomap_begin callback it wants when deciding to do a FS-based atomic write. Furthermore, it was thought that IOMAP_ATOMIC_HW is not a proper name, as the block driver can use SW-methods to emulate an atomic write. So change back to IOMAP_ATOMIC. The ->iomap_begin callback needs though to indicate to iomap core that REQ_ATOMIC needs to be set, so add IOMAP_F_ATOMIC_BIO for that. These changes were suggested by Christoph Hellwig and Dave Chinner. 
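[ editor: an illustrative sketch, not part of this patch -- how a
  filesystem's ->iomap_begin callback might honor the new contract by
  mapping the IOMAP_ATOMIC request flag onto the IOMAP_F_ATOMIC_BIO
  result flag. The function name is hypothetical:

	static int example_iomap_begin(struct inode *inode, loff_t pos,
				       loff_t length, unsigned flags,
				       struct iomap *iomap,
				       struct iomap *srcmap)
	{
		/* ... resolve pos/length to an extent as usual ... */

		/* ask the iomap core to issue this write as REQ_ATOMIC */
		if (flags & IOMAP_ATOMIC)
			iomap->flags |= IOMAP_F_ATOMIC_BIO;
		return 0;
	}
]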
Signed-off-by: John Garry Link: https://lore.kernel.org/r/20250320120250.4087011-4-john.g.garry@oracle.com Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/iomap.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 9cd93530013c..02fe001feebb 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -60,6 +60,9 @@ struct vm_fault; * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block * assigned to it yet and the file system will do that in the bio submission * handler, splitting the I/O as needed. + * + * IOMAP_F_ATOMIC_BIO indicates that (write) I/O will be issued as an atomic + * bio, i.e. set REQ_ATOMIC. */ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -73,6 +76,7 @@ struct vm_fault; #define IOMAP_F_XATTR (1U << 5) #define IOMAP_F_BOUNDARY (1U << 6) #define IOMAP_F_ANON_WRITE (1U << 7) +#define IOMAP_F_ATOMIC_BIO (1U << 8) /* * Flags set by the core iomap code during operations: @@ -189,9 +193,8 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ -#define IOMAP_ATOMIC_HW (1 << 9) /* HW-based torn-write protection */ +#define IOMAP_ATOMIC (1 << 9) /* torn-write protection */ #define IOMAP_DONTCACHE (1 << 10) -#define IOMAP_ATOMIC_SW (1 << 11)/* SW-based torn-write protection */ struct iomap_ops { /* @@ -503,11 +506,6 @@ struct iomap_dio_ops { */ #define IOMAP_DIO_PARTIAL (1 << 2) -/* - * Use software-based torn-write protection. - */ -#define IOMAP_DIO_ATOMIC_SW (1 << 3) - ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); -- cgit v1.2.3 From a72f418703b55072d837b72ac87091e18049260c Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:16 +0100 Subject: tty: convert "TTY Struct Flags" to an enum Convert TTY_* macros (flags) to an enum. This allows for easier kernel-doc (the comment needed fine tuning), grouping of these nicely, and proper checking. Note that these are bit positions. So they are used such as test_bit(TTY_THROTTLED, ...). Given these are not the user API (only in-kernel API/ABI), the bit positions are NOT preserved in this patch. All are renumbered naturally using the enum-auto-numbering. Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-2-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 52 +++++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 2372f9357240..6bb4fb3845f0 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -251,7 +251,7 @@ struct tty_file_private { }; /** - * DOC: TTY Struct Flags + * enum tty_struct_flags - TTY Struct Flags * * These bits are used in the :c:member:`tty_struct.flags` field. * @@ -260,62 +260,64 @@ struct tty_file_private { * tty->write. Thus, you must use the inline functions set_bit() and * clear_bit() to make things atomic. * - * TTY_THROTTLED + * @TTY_THROTTLED: * Driver input is throttled. The ldisc should call * :c:member:`tty_driver.unthrottle()` in order to resume reception when * it is ready to process more data (at threshold min). 
* - * TTY_IO_ERROR + * @TTY_IO_ERROR: * If set, causes all subsequent userspace read/write calls on the tty to * fail, returning -%EIO. (May be no ldisc too.) * - * TTY_OTHER_CLOSED + * @TTY_OTHER_CLOSED: * Device is a pty and the other side has closed. * - * TTY_EXCLUSIVE + * @TTY_EXCLUSIVE: * Exclusive open mode (a single opener). * - * TTY_DO_WRITE_WAKEUP + * @TTY_DO_WRITE_WAKEUP: * If set, causes the driver to call the * :c:member:`tty_ldisc_ops.write_wakeup()` method in order to resume * transmission when it can accept more data to transmit. * - * TTY_LDISC_OPEN + * @TTY_LDISC_OPEN: * Indicates that a line discipline is open. For debugging purposes only. * - * TTY_PTY_LOCK + * @TTY_PTY_LOCK: * A flag private to pty code to implement %TIOCSPTLCK/%TIOCGPTLCK logic. * - * TTY_NO_WRITE_SPLIT + * @TTY_NO_WRITE_SPLIT: * Prevent driver from splitting up writes into smaller chunks (preserve * write boundaries to driver). * - * TTY_HUPPED + * @TTY_HUPPED: * The TTY was hung up. This is set post :c:member:`tty_driver.hangup()`. * - * TTY_HUPPING + * @TTY_HUPPING: * The TTY is in the process of hanging up to abort potential readers. * - * TTY_LDISC_CHANGING + * @TTY_LDISC_CHANGING: * Line discipline for this TTY is being changed. I/O should not block * when this is set. Use tty_io_nonblock() to check. * - * TTY_LDISC_HALTED + * @TTY_LDISC_HALTED: * Line discipline for this TTY was stopped. No work should be queued to * this ldisc. */ -#define TTY_THROTTLED 0 -#define TTY_IO_ERROR 1 -#define TTY_OTHER_CLOSED 2 -#define TTY_EXCLUSIVE 3 -#define TTY_DO_WRITE_WAKEUP 5 -#define TTY_LDISC_OPEN 11 -#define TTY_PTY_LOCK 16 -#define TTY_NO_WRITE_SPLIT 17 -#define TTY_HUPPED 18 -#define TTY_HUPPING 19 -#define TTY_LDISC_CHANGING 20 -#define TTY_LDISC_HALTED 22 +enum tty_struct_flags { + TTY_THROTTLED, + TTY_IO_ERROR, + TTY_OTHER_CLOSED, + TTY_EXCLUSIVE, + TTY_DO_WRITE_WAKEUP, + TTY_LDISC_OPEN, + TTY_PTY_LOCK, + TTY_NO_WRITE_SPLIT, + TTY_HUPPED, + TTY_HUPPING, + TTY_LDISC_CHANGING, + TTY_LDISC_HALTED, +}; static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file) { -- cgit v1.2.3 From d2e38e713bada24539b2b049f9be8d40dea79f41 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:19 +0100 Subject: tty: move N_TTY_BUF_SIZE to n_tty "N_TTY_BUF_SIZE" is private to n_tty and shall not be exposed to the world. Definitely not in tty.h somewhere in the middle of "struct tty_struct". This is a remnant of moving "read_flags" to "struct n_tty_data" in commit 3fe780b379fa ("TTY: move ldisc data from tty_struct: bitmaps"). But some cleanup was needed first (in previous patches). Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-5-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 6bb4fb3845f0..0a46e4054dec 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -239,7 +239,6 @@ struct tty_struct { struct list_head tty_files; -#define N_TTY_BUF_SIZE 4096 struct work_struct SAK_work; } __randomize_layout; -- cgit v1.2.3 From 67acbcc324272ed06cd1c8f360afdeceb919af89 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:27 +0100 Subject: tty: tty_driver: move TTY macros to the top So that they can be referenced in structs once converted to enums (in the next patches). 
Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-13-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 156 ++++++++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index d4cdc089f6c3..f3be6d56e9e5 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -16,6 +16,84 @@ struct tty_driver; struct serial_icounter_struct; struct serial_struct; +/** + * DOC: TTY Driver Flags + * + * TTY_DRIVER_RESET_TERMIOS + * Requests the tty layer to reset the termios setting when the last + * process has closed the device. Used for PTYs, in particular. + * + * TTY_DRIVER_REAL_RAW + * Indicates that the driver will guarantee not to set any special + * character handling flags if this is set for the tty: + * + * ``(IGNBRK || (!BRKINT && !PARMRK)) && (IGNPAR || !INPCK)`` + * + * That is, if there is no reason for the driver to + * send notifications of parity and break characters up to the line + * driver, it won't do so. This allows the line driver to optimize for + * this case if this flag is set. (Note that there is also a promise, if + * the above case is true, not to signal overruns, either.) + * + * TTY_DRIVER_DYNAMIC_DEV + * The individual tty devices need to be registered with a call to + * tty_register_device() when the device is found in the system and + * unregistered with a call to tty_unregister_device() so the devices will + * be show up properly in sysfs. If not set, all &tty_driver.num entries + * will be created by the tty core in sysfs when tty_register_driver() is + * called. This is to be used by drivers that have tty devices that can + * appear and disappear while the main tty driver is registered with the + * tty core. + * + * TTY_DRIVER_DEVPTS_MEM + * Don't use the standard arrays (&tty_driver.ttys and + * &tty_driver.termios), instead use dynamic memory keyed through the + * devpts filesystem. This is only applicable to the PTY driver. + * + * TTY_DRIVER_HARDWARE_BREAK + * Hardware handles break signals. Pass the requested timeout to the + * &tty_operations.break_ctl instead of using a simple on/off interface. + * + * TTY_DRIVER_DYNAMIC_ALLOC + * Do not allocate structures which are needed per line for this driver + * (&tty_driver.ports) as it would waste memory. The driver will take + * care. This is only applicable to the PTY driver. + * + * TTY_DRIVER_UNNUMBERED_NODE + * Do not create numbered ``/dev`` nodes. For example, create + * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. Applicable only when a + * driver for a single tty device is being allocated. 
+ */ +#define TTY_DRIVER_INSTALLED 0x0001 +#define TTY_DRIVER_RESET_TERMIOS 0x0002 +#define TTY_DRIVER_REAL_RAW 0x0004 +#define TTY_DRIVER_DYNAMIC_DEV 0x0008 +#define TTY_DRIVER_DEVPTS_MEM 0x0010 +#define TTY_DRIVER_HARDWARE_BREAK 0x0020 +#define TTY_DRIVER_DYNAMIC_ALLOC 0x0040 +#define TTY_DRIVER_UNNUMBERED_NODE 0x0080 + +/* tty driver types */ +#define TTY_DRIVER_TYPE_SYSTEM 0x0001 +#define TTY_DRIVER_TYPE_CONSOLE 0x0002 +#define TTY_DRIVER_TYPE_SERIAL 0x0003 +#define TTY_DRIVER_TYPE_PTY 0x0004 +#define TTY_DRIVER_TYPE_SCC 0x0005 /* scc driver */ +#define TTY_DRIVER_TYPE_SYSCONS 0x0006 + +/* system subtypes (magic, used by tty_io.c) */ +#define SYSTEM_TYPE_TTY 0x0001 +#define SYSTEM_TYPE_CONSOLE 0x0002 +#define SYSTEM_TYPE_SYSCONS 0x0003 +#define SYSTEM_TYPE_SYSPTMX 0x0004 + +/* pty subtypes (magic, used by tty_io.c) */ +#define PTY_TYPE_MASTER 0x0001 +#define PTY_TYPE_SLAVE 0x0002 + +/* serial subtype definitions */ +#define SERIAL_TYPE_NORMAL 1 + /** * struct tty_operations -- interface between driver and tty * @@ -494,84 +572,6 @@ static inline void tty_set_operations(struct tty_driver *driver, driver->ops = op; } -/** - * DOC: TTY Driver Flags - * - * TTY_DRIVER_RESET_TERMIOS - * Requests the tty layer to reset the termios setting when the last - * process has closed the device. Used for PTYs, in particular. - * - * TTY_DRIVER_REAL_RAW - * Indicates that the driver will guarantee not to set any special - * character handling flags if this is set for the tty: - * - * ``(IGNBRK || (!BRKINT && !PARMRK)) && (IGNPAR || !INPCK)`` - * - * That is, if there is no reason for the driver to - * send notifications of parity and break characters up to the line - * driver, it won't do so. This allows the line driver to optimize for - * this case if this flag is set. (Note that there is also a promise, if - * the above case is true, not to signal overruns, either.) - * - * TTY_DRIVER_DYNAMIC_DEV - * The individual tty devices need to be registered with a call to - * tty_register_device() when the device is found in the system and - * unregistered with a call to tty_unregister_device() so the devices will - * be show up properly in sysfs. If not set, all &tty_driver.num entries - * will be created by the tty core in sysfs when tty_register_driver() is - * called. This is to be used by drivers that have tty devices that can - * appear and disappear while the main tty driver is registered with the - * tty core. - * - * TTY_DRIVER_DEVPTS_MEM - * Don't use the standard arrays (&tty_driver.ttys and - * &tty_driver.termios), instead use dynamic memory keyed through the - * devpts filesystem. This is only applicable to the PTY driver. - * - * TTY_DRIVER_HARDWARE_BREAK - * Hardware handles break signals. Pass the requested timeout to the - * &tty_operations.break_ctl instead of using a simple on/off interface. - * - * TTY_DRIVER_DYNAMIC_ALLOC - * Do not allocate structures which are needed per line for this driver - * (&tty_driver.ports) as it would waste memory. The driver will take - * care. This is only applicable to the PTY driver. - * - * TTY_DRIVER_UNNUMBERED_NODE - * Do not create numbered ``/dev`` nodes. For example, create - * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. Applicable only when a - * driver for a single tty device is being allocated. 
- */ -#define TTY_DRIVER_INSTALLED 0x0001 -#define TTY_DRIVER_RESET_TERMIOS 0x0002 -#define TTY_DRIVER_REAL_RAW 0x0004 -#define TTY_DRIVER_DYNAMIC_DEV 0x0008 -#define TTY_DRIVER_DEVPTS_MEM 0x0010 -#define TTY_DRIVER_HARDWARE_BREAK 0x0020 -#define TTY_DRIVER_DYNAMIC_ALLOC 0x0040 -#define TTY_DRIVER_UNNUMBERED_NODE 0x0080 - -/* tty driver types */ -#define TTY_DRIVER_TYPE_SYSTEM 0x0001 -#define TTY_DRIVER_TYPE_CONSOLE 0x0002 -#define TTY_DRIVER_TYPE_SERIAL 0x0003 -#define TTY_DRIVER_TYPE_PTY 0x0004 -#define TTY_DRIVER_TYPE_SCC 0x0005 /* scc driver */ -#define TTY_DRIVER_TYPE_SYSCONS 0x0006 - -/* system subtypes (magic, used by tty_io.c) */ -#define SYSTEM_TYPE_TTY 0x0001 -#define SYSTEM_TYPE_CONSOLE 0x0002 -#define SYSTEM_TYPE_SYSCONS 0x0003 -#define SYSTEM_TYPE_SYSPTMX 0x0004 - -/* pty subtypes (magic, used by tty_io.c) */ -#define PTY_TYPE_MASTER 0x0001 -#define PTY_TYPE_SLAVE 0x0002 - -/* serial subtype definitions */ -#define SERIAL_TYPE_NORMAL 1 - int tty_register_driver(struct tty_driver *driver); void tty_unregister_driver(struct tty_driver *driver); struct device *tty_register_device(struct tty_driver *driver, unsigned index, -- cgit v1.2.3 From 109e06ae1dcf41d470705c54a67b123792424279 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:28 +0100 Subject: tty: tty_driver: convert "TTY Driver Flags" to an enum Convert TTY_DRIVER_* macros (flags) to an enum. This allows for easier kernel-doc (the comment needed fine tuning), grouping of these nicely, and proper checking. Given these are flags, define them using modern BIT() instead of hex constants. It turns out (thanks, kernel-doc checker) that internal TTY_DRIVER_INSTALLED was undocumented. Fix that too. Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-14-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index f3be6d56e9e5..d0d940236580 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -17,13 +17,19 @@ struct serial_icounter_struct; struct serial_struct; /** - * DOC: TTY Driver Flags + * enum tty_driver_flag -- TTY Driver Flags * - * TTY_DRIVER_RESET_TERMIOS + * These are flags passed to tty_alloc_driver(). + * + * @TTY_DRIVER_INSTALLED: + * Whether this driver was succesfully installed. This is a tty internal + * flag. Do not touch. + * + * @TTY_DRIVER_RESET_TERMIOS: * Requests the tty layer to reset the termios setting when the last * process has closed the device. Used for PTYs, in particular. * - * TTY_DRIVER_REAL_RAW + * @TTY_DRIVER_REAL_RAW: * Indicates that the driver will guarantee not to set any special * character handling flags if this is set for the tty: * @@ -35,7 +41,7 @@ struct serial_struct; * this case if this flag is set. (Note that there is also a promise, if * the above case is true, not to signal overruns, either.) * - * TTY_DRIVER_DYNAMIC_DEV + * @TTY_DRIVER_DYNAMIC_DEV: * The individual tty devices need to be registered with a call to * tty_register_device() when the device is found in the system and * unregistered with a call to tty_unregister_device() so the devices will @@ -45,33 +51,35 @@ struct serial_struct; * appear and disappear while the main tty driver is registered with the * tty core. 
* - * TTY_DRIVER_DEVPTS_MEM + * @TTY_DRIVER_DEVPTS_MEM: * Don't use the standard arrays (&tty_driver.ttys and * &tty_driver.termios), instead use dynamic memory keyed through the * devpts filesystem. This is only applicable to the PTY driver. * - * TTY_DRIVER_HARDWARE_BREAK + * @TTY_DRIVER_HARDWARE_BREAK: * Hardware handles break signals. Pass the requested timeout to the * &tty_operations.break_ctl instead of using a simple on/off interface. * - * TTY_DRIVER_DYNAMIC_ALLOC + * @TTY_DRIVER_DYNAMIC_ALLOC: * Do not allocate structures which are needed per line for this driver * (&tty_driver.ports) as it would waste memory. The driver will take * care. This is only applicable to the PTY driver. * - * TTY_DRIVER_UNNUMBERED_NODE + * @TTY_DRIVER_UNNUMBERED_NODE: * Do not create numbered ``/dev`` nodes. For example, create * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. Applicable only when a * driver for a single tty device is being allocated. */ -#define TTY_DRIVER_INSTALLED 0x0001 -#define TTY_DRIVER_RESET_TERMIOS 0x0002 -#define TTY_DRIVER_REAL_RAW 0x0004 -#define TTY_DRIVER_DYNAMIC_DEV 0x0008 -#define TTY_DRIVER_DEVPTS_MEM 0x0010 -#define TTY_DRIVER_HARDWARE_BREAK 0x0020 -#define TTY_DRIVER_DYNAMIC_ALLOC 0x0040 -#define TTY_DRIVER_UNNUMBERED_NODE 0x0080 +enum tty_driver_flag { + TTY_DRIVER_INSTALLED = BIT(0), + TTY_DRIVER_RESET_TERMIOS = BIT(1), + TTY_DRIVER_REAL_RAW = BIT(2), + TTY_DRIVER_DYNAMIC_DEV = BIT(3), + TTY_DRIVER_DEVPTS_MEM = BIT(4), + TTY_DRIVER_HARDWARE_BREAK = BIT(5), + TTY_DRIVER_DYNAMIC_ALLOC = BIT(6), + TTY_DRIVER_UNNUMBERED_NODE = BIT(7), +}; /* tty driver types */ #define TTY_DRIVER_TYPE_SYSTEM 0x0001 -- cgit v1.2.3 From 63f3cd5d80d720fc6c9fd321138bbbf322ade392 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:29 +0100 Subject: tty: tty_driver: document both {,__}tty_alloc_driver() properly __tty_alloc_driver()'s kernel-doc needed some care: describe the return value using the standard "Returns:", and use the new enum tty_driver_flag for @flags. Then, the tty_alloc_driver() macro was undocumented, but referenced many times in the docs. Copy the docs from the above (except the @owner parameter, obviously). Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-15-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index d0d940236580..0fe38befa1b8 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -564,7 +564,13 @@ struct tty_driver *tty_find_polling_driver(char *name, int *line); void tty_driver_kref_put(struct tty_driver *driver); -/* Use TTY_DRIVER_* flags below */ +/** + * tty_alloc_driver - allocate tty driver + * @lines: count of lines this driver can handle at most + * @flags: some of enum tty_driver_flag, will be set in driver->flags + * + * Returns: struct tty_driver or a PTR-encoded error (use IS_ERR() and friends). + */ #define tty_alloc_driver(lines, flags) \ __tty_alloc_driver(lines, THIS_MODULE, flags) -- cgit v1.2.3 From 52443558adcdb11cff76beec34bf75f6779e1a08 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:30 +0100 Subject: tty: tty_driver: introduce TTY driver sub/types enums Convert TTY_DRIVER_TYPE_*, and subtype macros to two enums: tty_driver_type and tty_driver_subtype. 
This allows for easier kernel-doc (later), grouping of these nicely, and proper checking. The tty_driver's ::type and ::subtype now use these enums instead of bare "short". Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250317070046.24386-16-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 0fe38befa1b8..188ee9b768eb 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -81,26 +81,26 @@ enum tty_driver_flag { TTY_DRIVER_UNNUMBERED_NODE = BIT(7), }; -/* tty driver types */ -#define TTY_DRIVER_TYPE_SYSTEM 0x0001 -#define TTY_DRIVER_TYPE_CONSOLE 0x0002 -#define TTY_DRIVER_TYPE_SERIAL 0x0003 -#define TTY_DRIVER_TYPE_PTY 0x0004 -#define TTY_DRIVER_TYPE_SCC 0x0005 /* scc driver */ -#define TTY_DRIVER_TYPE_SYSCONS 0x0006 +enum tty_driver_type { + TTY_DRIVER_TYPE_SYSTEM, + TTY_DRIVER_TYPE_CONSOLE, + TTY_DRIVER_TYPE_SERIAL, + TTY_DRIVER_TYPE_PTY, + TTY_DRIVER_TYPE_SCC, + TTY_DRIVER_TYPE_SYSCONS, +}; -/* system subtypes (magic, used by tty_io.c) */ -#define SYSTEM_TYPE_TTY 0x0001 -#define SYSTEM_TYPE_CONSOLE 0x0002 -#define SYSTEM_TYPE_SYSCONS 0x0003 -#define SYSTEM_TYPE_SYSPTMX 0x0004 +enum tty_driver_subtype { + SYSTEM_TYPE_TTY = 1, + SYSTEM_TYPE_CONSOLE, + SYSTEM_TYPE_SYSCONS, + SYSTEM_TYPE_SYSPTMX, -/* pty subtypes (magic, used by tty_io.c) */ -#define PTY_TYPE_MASTER 0x0001 -#define PTY_TYPE_SLAVE 0x0002 + PTY_TYPE_MASTER = 1, + PTY_TYPE_SLAVE, -/* serial subtype definitions */ -#define SERIAL_TYPE_NORMAL 1 + SERIAL_TYPE_NORMAL = 1, +}; /** * struct tty_operations -- interface between driver and tty @@ -500,8 +500,8 @@ struct tty_operations { * @major: major /dev device number (zero for autoassignment) * @minor_start: the first minor /dev device number * @num: number of devices allocated - * @type: type of tty driver (%TTY_DRIVER_TYPE_) - * @subtype: subtype of tty driver (%SYSTEM_TYPE_, %PTY_TYPE_, %SERIAL_TYPE_) + * @type: type of tty driver (enum tty_driver_type) + * @subtype: subtype of tty driver (enum tty_driver_subtype) * @init_termios: termios to set to each tty initially (e.g. %tty_std_termios) * @flags: tty driver flags (%TTY_DRIVER_) * @proc_entry: proc fs entry, used internally @@ -533,8 +533,8 @@ struct tty_driver { int major; int minor_start; unsigned int num; - short type; - short subtype; + enum tty_driver_type type; + enum tty_driver_subtype subtype; struct ktermios init_termios; unsigned long flags; struct proc_dir_entry *proc_entry; -- cgit v1.2.3 From 5af89030960fd987a6c25e33405bbaaff3c7298c Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 17 Mar 2025 08:00:31 +0100 Subject: tty: serdev: drop serdev_controller_ops::write_room() In particular, serdev_device_write_room() is not called, so the whole serdev's write_room() can go. 
Signed-off-by: Jiri Slaby (SUSE)
Cc: Rob Herring
Link: https://lore.kernel.org/r/20250317070046.24386-17-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/serdev.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/serdev.h b/include/linux/serdev.h
index ff78efc1f60d..34562eb99931 100644
--- a/include/linux/serdev.h
+++ b/include/linux/serdev.h
@@ -84,7 +84,6 @@ enum serdev_parity {
 struct serdev_controller_ops {
 ssize_t (*write_buf)(struct serdev_controller *, const u8 *, size_t);
 void (*write_flush)(struct serdev_controller *);
- int (*write_room)(struct serdev_controller *);
 int (*open)(struct serdev_controller *);
 void (*close)(struct serdev_controller *);
 void (*set_flow_control)(struct serdev_controller *, bool);
@@ -212,7 +211,6 @@ int serdev_device_break_ctl(struct serdev_device *serdev, int break_state);
 void serdev_device_write_wakeup(struct serdev_device *);
 ssize_t serdev_device_write(struct serdev_device *, const u8 *, size_t, long);
 void serdev_device_write_flush(struct serdev_device *);
-int serdev_device_write_room(struct serdev_device *);

 /*
  * serdev device driver functions
  */
@@ -273,10 +271,6 @@ static inline ssize_t serdev_device_write(struct serdev_device *sdev,
 return -ENODEV;
 }
 static inline void serdev_device_write_flush(struct serdev_device *sdev) {}
-static inline int serdev_device_write_room(struct serdev_device *sdev)
-{
- return 0;
-}

 #define serdev_device_driver_register(x)
 #define serdev_device_driver_unregister(x)
-- cgit v1.2.3


From 093c8812de2d3436744fd10edab9bf816b94f833 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed
Date: Wed, 19 Mar 2025 21:00:13 +0000
Subject: cgroup: rstat: Cleanup flushing functions and locking

Now that the rstat lock is being re-acquired on every CPU iteration in
cgroup_rstat_flush_locked(), having the caller initially acquire the
lock is unnecessary and unclear. Inline cgroup_rstat_flush_locked()
into cgroup_rstat_flush() and move the lock/unlock calls to the
beginning and ending of the loop body to make the critical section
obvious.

cgroup_rstat_flush_hold/release() do not make much sense with the lock
being dropped and reacquired internally. Since it has no external
callers, remove it and explicitly acquire the lock in
cgroup_base_stat_cputime_show() instead.

This leaves the code with a single flushing function,
cgroup_rstat_flush().

Signed-off-by: Yosry Ahmed
Signed-off-by: Tejun Heo
---
 include/linux/cgroup.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8e7415c64ed1..28e999f2c642 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -690,8 +690,6 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
  */
 void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
 void cgroup_rstat_flush(struct cgroup *cgrp);
-void cgroup_rstat_flush_hold(struct cgroup *cgrp);
-void cgroup_rstat_flush_release(struct cgroup *cgrp);

 /*
  * Basic resource stats.
-- cgit v1.2.3


From fe14262695526145334bfa0bb51fcc365cf6dfb5 Mon Sep 17 00:00:00 2001
From: Thorsten Blum
Date: Tue, 11 Mar 2025 10:16:34 +0100
Subject: hyperv: Remove unused union and structs

The union vmpacket_largest_possible_header and several structs have
not been used for a long time afaict - remove them.
Reviewed-by: Michael Kelley Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20250311091634.494888-2-thorsten.blum@linux.dev Signed-off-by: Wei Liu Message-ID: <20250311091634.494888-2-thorsten.blum@linux.dev> --- include/linux/hyperv.h | 56 -------------------------------------------------- 1 file changed, 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 7f4f8d8bdf43..675959fb97ba 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -371,19 +371,6 @@ struct vmtransfer_page_packet_header { struct vmtransfer_page_range ranges[]; } __packed; -struct vmgpadl_packet_header { - struct vmpacket_descriptor d; - u32 gpadl; - u32 reserved; -} __packed; - -struct vmadd_remove_transfer_page_set { - struct vmpacket_descriptor d; - u32 gpadl; - u16 xfer_pageset_id; - u16 reserved; -} __packed; - /* * This structure defines a range in guest physical space that can be made to * look virtually contiguous. @@ -394,30 +381,6 @@ struct gpa_range { u64 pfn_array[]; }; -/* - * This is the format for an Establish Gpadl packet, which contains a handle by - * which this GPADL will be known and a set of GPA ranges associated with it. - * This can be converted to a MDL by the guest OS. If there are multiple GPA - * ranges, then the resulting MDL will be "chained," representing multiple VA - * ranges. - */ -struct vmestablish_gpadl { - struct vmpacket_descriptor d; - u32 gpadl; - u32 range_cnt; - struct gpa_range range[1]; -} __packed; - -/* - * This is the format for a Teardown Gpadl packet, which indicates that the - * GPADL handle in the Establish Gpadl packet will never be referenced again. - */ -struct vmteardown_gpadl { - struct vmpacket_descriptor d; - u32 gpadl; - u32 reserved; /* for alignment to a 8-byte boundary */ -} __packed; - /* * This is the format for a GPA-Direct packet, which contains a set of GPA * ranges, in addition to commands and/or data. @@ -429,25 +392,6 @@ struct vmdata_gpa_direct { struct gpa_range range[1]; } __packed; -/* This is the format for a Additional Data Packet. */ -struct vmadditional_data { - struct vmpacket_descriptor d; - u64 total_bytes; - u32 offset; - u32 byte_cnt; - unsigned char data[1]; -} __packed; - -union vmpacket_largest_possible_header { - struct vmpacket_descriptor simple_hdr; - struct vmtransfer_page_packet_header xfer_page_hdr; - struct vmgpadl_packet_header gpadl_hdr; - struct vmadd_remove_transfer_page_set add_rm_xfer_page_hdr; - struct vmestablish_gpadl establish_gpadl_hdr; - struct vmteardown_gpadl teardown_gpadl_hdr; - struct vmdata_gpa_direct data_gpa_direct_hdr; -}; - #define VMPACKET_DATA_START_ADDRESS(__packet) \ (void *)(((unsigned char *)__packet) + \ ((struct vmpacket_descriptor)__packet)->offset8 * 8) -- cgit v1.2.3 From 1cf8e152e8c909c2d6c610b35278a7480af7a156 Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Wed, 19 Mar 2025 12:12:18 -0400 Subject: cpumask: align text in comment Since commit 4e1a7df45480 ("cpumask: Add enabled cpumask for present CPUs that can be brought online") introduced cpu_enabled_mask, the comment line describing the mask has been slightly out of alignment with the adjacent lines. Fix this by removing a single space character. 
Signed-off-by: Joel Savitz Signed-off-by: Yury Norov --- include/linux/cpumask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 9289d4612170..f9a868384083 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -81,7 +81,7 @@ static __always_inline void set_nr_cpu_ids(unsigned int nr) * * cpu_possible_mask- has bit 'cpu' set iff cpu is populatable * cpu_present_mask - has bit 'cpu' set iff cpu is populated - * cpu_enabled_mask - has bit 'cpu' set iff cpu can be brought online + * cpu_enabled_mask - has bit 'cpu' set iff cpu can be brought online * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration * -- cgit v1.2.3 From 95c4e6d42c99237c0264d1a401a6223849477cb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 11 Mar 2025 19:46:58 +0200 Subject: PCI: Move pci_rescan_bus_bridge_resize() declaration to pci/pci.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pci_rescan_bus_bridge_resize() is only used by code inside PCI subsystem. The comment also falsely advertises it to be for hotplug drivers, yet the only caller is from sysfs store function. Move the function declaration into pci/pci.h. Link: https://lore.kernel.org/r/20250311174701.3586-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 47b31ad724fa..d788acf2686a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1455,7 +1455,6 @@ void set_pcie_port_type(struct pci_dev *pdev); void set_pcie_hotplug_bridge(struct pci_dev *pdev); /* Functions for PCI Hotplug drivers to use */ -unsigned int pci_rescan_bus_bridge_resize(struct pci_dev *bridge); unsigned int pci_rescan_bus(struct pci_bus *bus); void pci_lock_rescan_remove(void); void pci_unlock_rescan_remove(void); -- cgit v1.2.3 From 7d4bcc0f2631e4ee10b5bcfff24a423d1c3c02a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 11 Mar 2025 19:46:59 +0200 Subject: PCI: Move resource reassignment func declarations into pci/pci.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Neither pci_reassign_bridge_resources() nor pci_reassign_resource() is used outside of the PCI subsystem. They seem to be naturally static functions but since resource fitting/assignment is split between setup-bus.c and setup-res.c, they fall into different sides of the divide and need to be declared. Move the declarations of pci_reassign_bridge_resources() and pci_reassign_resource() into pci/pci.h to keep them internal to PCI subsystem. 
Link: https://lore.kernel.org/r/20250311174701.3586-2-ilpo.jarvinen@linux.intel.com
Signed-off-by: Ilpo Järvinen
Signed-off-by: Bjorn Helgaas
---
 include/linux/pci.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index d788acf2686a..c629962f4ccd 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1396,7 +1396,6 @@ void pci_reset_secondary_bus(struct pci_dev *dev);
 void pcibios_reset_secondary_bus(struct pci_dev *dev);
 void pci_update_resource(struct pci_dev *dev, int resno);
 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
-int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align);
 void pci_release_resource(struct pci_dev *dev, int resno);
 static inline int pci_rebar_bytes_to_size(u64 bytes)
 {
@@ -1476,7 +1475,6 @@ void pci_assign_unassigned_resources(void);
 void pci_assign_unassigned_bridge_resources(struct pci_dev *bridge);
 void pci_assign_unassigned_bus_resources(struct pci_bus *bus);
 void pci_assign_unassigned_root_bus_resources(struct pci_bus *bus);
-int pci_reassign_bridge_resources(struct pci_dev *bridge, unsigned long type);
 int pci_enable_resources(struct pci_dev *, int mask);
 void pci_assign_irq(struct pci_dev *dev);
 struct resource *pci_find_resource(struct pci_dev *dev, struct resource *res);
-- cgit v1.2.3


From 2f255e299c676cbae4b0d164a497a0553b86845a Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen
Date: Tue, 11 Mar 2025 19:47:00 +0200
Subject: PCI: Make pci_setup_bridge() static
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pci_setup_bridge() is only used within setup-bus.c. Therefore, make it
a static function.

Link: https://lore.kernel.org/r/20250311174701.3586-3-ilpo.jarvinen@linux.intel.com
Signed-off-by: Ilpo Järvinen
Signed-off-by: Bjorn Helgaas
---
 include/linux/pci.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index c629962f4ccd..9a703355ef06 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1634,7 +1634,6 @@ void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
 void *userdata);
 int pci_cfg_space_size(struct pci_dev *dev);
 unsigned char pci_bus_max_busnr(struct pci_bus *bus);
-void pci_setup_bridge(struct pci_bus *bus);
 resource_size_t pcibios_window_alignment(struct pci_bus *bus,
 unsigned long type);
-- cgit v1.2.3


From cc7a371b0bf5e507b24c5a595068dfb4e2b3445b Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen
Date: Tue, 11 Mar 2025 19:47:01 +0200
Subject: PCI: Move cardbus IO size declarations into pci/pci.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For some reason, cardbus-related io/mem size declarations are in
linux/pci.h, whereas non-cardbus sizes are already in pci/pci.h.

Move them all into one place in pci/pci.h.
Link: https://lore.kernel.org/r/20250311174701.3586-4-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 9a703355ef06..f9424478a19a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2332,8 +2332,6 @@ extern int pci_pci_problems; #define PCIPCI_ALIMAGIK 32 /* Need low latency setting */ #define PCIAGP_FAIL 64 /* No PCI to AGP DMA */ -extern unsigned long pci_cardbus_io_size; -extern unsigned long pci_cardbus_mem_size; extern u8 pci_dfl_cache_line_size; extern u8 pci_cache_line_size; -- cgit v1.2.3 From 7e9dd0d1e9c50cedef403d4bef6a2c1dc22ac79d Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 20 Mar 2025 12:44:09 -0700 Subject: pds_core: add new fwctl auxiliary_device Add support for a new fwctl-based auxiliary_device for creating a channel for fwctl support into the AMD/Pensando DSC. Link: https://patch.msgid.link/r/20250320194412.67983-4-shannon.nelson@amd.com Reviewed-by: Leon Romanovsky Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: Shannon Nelson Signed-off-by: Jason Gunthorpe --- include/linux/pds/pds_common.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h index 5802e1deef24..b193adbe7cc3 100644 --- a/include/linux/pds/pds_common.h +++ b/include/linux/pds/pds_common.h @@ -29,6 +29,7 @@ enum pds_core_vif_types { PDS_DEV_TYPE_ETH = 3, PDS_DEV_TYPE_RDMA = 4, PDS_DEV_TYPE_LM = 5, + PDS_DEV_TYPE_FWCTL = 6, /* new ones added before this line */ PDS_DEV_TYPE_MAX = 16 /* don't change - used in struct size */ @@ -40,6 +41,7 @@ enum pds_core_vif_types { #define PDS_DEV_TYPE_ETH_STR "Eth" #define PDS_DEV_TYPE_RDMA_STR "RDMA" #define PDS_DEV_TYPE_LM_STR "LM" +#define PDS_DEV_TYPE_FWCTL_STR "fwctl" #define PDS_VDPA_DEV_NAME PDS_CORE_DRV_NAME "." PDS_DEV_TYPE_VDPA_STR #define PDS_VFIO_LM_DEV_NAME PDS_CORE_DRV_NAME "." PDS_DEV_TYPE_LM_STR "." PDS_DEV_TYPE_VFIO_STR -- cgit v1.2.3 From 5c12a9cdb5ad54621f1b7c02df7993a4a1b86a46 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:10 +0100 Subject: nvme: add nvme_auth_generate_psk() Add a function to generate a NVMe PSK from the shared credentials negotiated by DH-HMAC-CHAP. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- include/linux/nvme-auth.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index c1d0bc5d9624..b13884b04dfd 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -40,5 +40,8 @@ int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, u8 *ctrl_key, size_t ctrl_key_len, u8 *sess_key, size_t sess_key_len); +int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, + u8 *c1, u8 *c2, size_t hash_len, + u8 **ret_psk, size_t *ret_len); #endif /* _NVME_AUTH_H */ -- cgit v1.2.3 From 71972b9ffe1efe183a87d76d094236f9ec30656e Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:11 +0100 Subject: nvme: add nvme_auth_generate_digest() Add a function to calculate the PSK digest as specified in TP8018. 
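[ editor: a rough usage sketch, not from the patches themselves, of how
  a host might chain the two new helpers after a completed DH-HMAC-CHAP
  transaction. Variable names are illustrative and error unwinding is
  omitted:

	u8 *psk, *digest;
	size_t psk_len;
	int ret;

	ret = nvme_auth_generate_psk(hmac_id, skey, skey_len,
				     c1, c2, hash_len, &psk, &psk_len);
	if (ret)
		return ret;

	ret = nvme_auth_generate_digest(hmac_id, psk, psk_len,
					subsysnqn, hostnqn, &digest);
]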
Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- include/linux/nvme-auth.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index b13884b04dfd..998f06bf10fd 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -43,5 +43,7 @@ int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, u8 *c1, u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len); +int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, + char *subsysnqn, char *hostnqn, u8 **ret_digest); #endif /* _NVME_AUTH_H */ -- cgit v1.2.3 From 9d5c0fffee266f61ccc745faa6298dafe2b8c5bf Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:12 +0100 Subject: nvme: add nvme_auth_derive_tls_psk() Add a function to derive the TLS PSK as specified in TP8018. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- include/linux/nvme-auth.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index 998f06bf10fd..60e069a6757f 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -45,5 +45,7 @@ int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, u8 **ret_psk, size_t *ret_len); int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, char *subsysnqn, char *hostnqn, u8 **ret_digest); +int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, + u8 *psk_digest, u8 **ret_psk); #endif /* _NVME_AUTH_H */ -- cgit v1.2.3 From 62eb89323cb08f1d6a3b41b84972ff4f373a1960 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:13 +0100 Subject: nvme-keyring: add nvme_tls_psk_refresh() Add a function to refresh a generated PSK in the specified keyring. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- include/linux/nvme-keyring.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h index 19d2b256180f..ab8971afa973 100644 --- a/include/linux/nvme-keyring.h +++ b/include/linux/nvme-keyring.h @@ -6,15 +6,25 @@ #ifndef _NVME_KEYRING_H #define _NVME_KEYRING_H +#include <linux/key.h> + #if IS_ENABLED(CONFIG_NVME_KEYRING) +struct key *nvme_tls_psk_refresh(struct key *keyring, + const char *hostnqn, const char *subnqn, u8 hmac_id, + u8 *data, size_t data_len, const char *digest); key_serial_t nvme_tls_psk_default(struct key *keyring, const char *hostnqn, const char *subnqn); key_serial_t nvme_keyring_id(void); struct key *nvme_tls_key_lookup(key_serial_t key_id); #else - +static inline struct key *nvme_tls_psk_refresh(struct key *keyring, + const char *hostnqn, char *subnqn, u8 hmac_id, + u8 *data, size_t data_len, const char *digest) +{ + return ERR_PTR(-ENOTSUPP); +} static inline key_serial_t nvme_tls_psk_default(struct key *keyring, const char *hostnqn, const char *subnqn) { -- cgit v1.2.3 From e88a7595b57f2a04f1be796419444b4a14a55d18 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:14 +0100 Subject: nvme-tcp: request secure channel concatenation Add a fabrics option 'concat' to request secure channel concatenation as specified in the NVMe Base Specification v2.1, section 8.3.4.3: Secure Channel Concatenation.
When secure channel concatenation is enabled a 'generated PSK' is inserted into the keyring such that it's available after reset. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- include/linux/nvme.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index fe3b60818fdc..bfb5688363b0 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1746,6 +1746,13 @@ enum { NVME_AUTH_DHGROUP_INVALID = 0xff, }; +enum { + NVME_AUTH_SECP_NOSC = 0x00, + NVME_AUTH_SECP_SC = 0x01, + NVME_AUTH_SECP_NEWTLSPSK = 0x02, + NVME_AUTH_SECP_REPLACETLSPSK = 0x03, +}; + union nvmf_auth_protocol { struct nvmf_auth_dhchap_protocol_descriptor dhchap; }; -- cgit v1.2.3 From 51d65049cd7e22a4d9ab8f2acb018a147f7f5146 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Wed, 19 Mar 2025 14:53:48 -0700 Subject: bpf: Add struct_ops context information to struct bpf_prog_aux This patch adds struct_ops context information to struct bpf_prog_aux. This context information will be used in the kfunc filter. Currently the added context information includes struct_ops member offset and a pointer to struct bpf_struct_ops. Signed-off-by: Juntong Deng Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Link: https://patch.msgid.link/20250319215358.2287371-2-ameryhung@gmail.com --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 973a88d9b52b..111bea4e507f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1521,6 +1521,7 @@ struct bpf_prog_aux { u32 real_func_cnt; /* includes hidden progs, only used for JIT and freeing progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + u32 attach_st_ops_member_off; u32 ctx_arg_info_size; u32 max_rdonly_access; u32 max_rdwr_access; @@ -1566,6 +1567,7 @@ struct bpf_prog_aux { #endif struct bpf_ksym ksym; const struct bpf_prog_ops *ops; + const struct bpf_struct_ops *st_ops; struct bpf_map **used_maps; struct mutex used_maps_mutex; /* mutex for used_maps and used_map_cnt */ struct btf_mod_pair *used_btfs; -- cgit v1.2.3 From c3164d2e0d181027da8fc94f8179d8607c3d440f Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Wed, 19 Feb 2025 10:20:57 +0100 Subject: PCI/MSI: Convert pci_msi_ignore_mask to per MSI domain flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Setting pci_msi_ignore_mask inhibits the toggling of the mask bit for both MSI and MSI-X entries globally, regardless of the IRQ chip they are using. Only Xen sets the pci_msi_ignore_mask when routing physical interrupts over event channels, to prevent PCI code from attempting to toggle the maskbit, as it's Xen that controls the bit. However, the pci_msi_ignore_mask being global will affect devices that use MSI interrupts but are not routing those interrupts over event channels (not using the Xen pIRQ chip). One example is devices behind a VMD PCI bridge. In that scenario the VMD bridge configures MSI(-X) using the normal IRQ chip (the pIRQ one in the Xen case), and devices behind the bridge configure the MSI entries using indexes into the VMD bridge MSI table. The VMD bridge then demultiplexes such interrupts and delivers to the destination device(s). 
Having pci_msi_ignore_mask set in that scenario prevents (un)masking of MSI entries for devices behind the VMD bridge. Move the signaling of no entry masking into the MSI domain flags, as that allows setting it on a per-domain basis. Set it for the Xen MSI domain that uses the pIRQ chip, while leaving it unset for the rest of the cases. Remove pci_msi_ignore_mask at once, since it was only used by Xen code, and with Xen dropping usage the variable is unneeded. This fixes using devices behind a VMD bridge on Xen PV hardware domains. Although devices behind a VMD bridge are not known to Xen, that doesn't mean Linux cannot use them. By inhibiting the usage of VMD_FEAT_CAN_BYPASS_MSI_REMAP and removing the pci_msi_ignore_mask bodge, devices behind a VMD bridge work fine when used from a Linux Xen hardware domain. That's the whole point of the series. Signed-off-by: Roger Pau Monné Reviewed-by: Thomas Gleixner Acked-by: Juergen Gross Acked-by: Bjorn Helgaas Message-ID: <20250219092059.90850-4-roger.pau@citrix.com> Signed-off-by: Juergen Gross --- include/linux/msi.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index b10093c4d00e..59a421fc42bf 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -73,7 +73,6 @@ struct msi_msg { }; }; -extern int pci_msi_ignore_mask; /* Helper functions */ struct msi_desc; struct pci_dev; @@ -556,6 +555,8 @@ enum { MSI_FLAG_PCI_MSIX_ALLOC_DYN = (1 << 20), /* PCI MSIs cannot be steered separately to CPU cores */ MSI_FLAG_NO_AFFINITY = (1 << 21), + /* Inhibit usage of entry masking */ + MSI_FLAG_NO_MASK = (1 << 22), }; /** -- cgit v1.2.3 From c25951eb7518844fcb7fc9ec58e888731e8c46d0 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 15 Nov 2024 15:20:55 +0000 Subject: bus: fsl-mc: Remove deadcode fsl_mc_allocator_driver_exit() was added explicitly by commit 1e8ac83b6caf ("bus: fsl-mc: add fsl_mc_allocator cleanup function") but was never used. Remove it. fsl_mc_portal_reset() was added in 2015 by commit 197f4d6a4a00 ("staging: fsl-mc: fsl-mc object allocator driver") but was never used. Remove it. fsl_mc_portal_reset() was the only caller of dpmcp_reset(). Remove it. Signed-off-by: Dr. David Alan Gilbert Acked-by: Ioana Ciornei Acked-by: Christophe Leroy Link: https://lore.kernel.org/r/20241115152055.279732-1-linux@treblig.org Signed-off-by: Christophe Leroy --- include/linux/fsl/mc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index 99f30c7d6208..897d6211c163 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -417,8 +417,6 @@ int __must_check fsl_mc_portal_allocate(struct fsl_mc_device *mc_dev, void fsl_mc_portal_free(struct fsl_mc_io *mc_io); -int fsl_mc_portal_reset(struct fsl_mc_io *mc_io); - int __must_check fsl_mc_object_allocate(struct fsl_mc_device *mc_dev, enum fsl_mc_pool_type pool_type, struct fsl_mc_device **new_mc_adev); -- cgit v1.2.3 From fc8d5bba61ad8087af9a56337a7a297af6b46129 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 13 Mar 2025 13:14:53 +0800 Subject: lib/scatterlist: Add SG_MITER_LOCAL and use it Add kmap_local support to the scatterlist iterator. Use it for all the helper functions in lib/scatterlist.
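As a sketch of intended usage (an editorial example, not code from this patch; the function below is hypothetical), a consumer could walk a scatterlist with the new flag like this:

#include <linux/scatterlist.h>

/* Sketch: XOR-checksum the bytes of a scatterlist. SG_MITER_LOCAL asks
 * the iterator to map each page with kmap_local rather than kmap_atomic. */
static u8 example_xor_csum(struct scatterlist *sgl, unsigned int nents)
{
	struct sg_mapping_iter miter;
	u8 csum = 0;
	size_t i;

	sg_miter_start(&miter, sgl, nents, SG_MITER_FROM_SG | SG_MITER_LOCAL);
	while (sg_miter_next(&miter)) {
		/* miter.addr/miter.length describe the currently mapped chunk */
		const u8 *p = miter.addr;

		for (i = 0; i < miter.length; i++)
			csum ^= p[i];
	}
	sg_miter_stop(&miter);
	return csum;
}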
Signed-off-by: Herbert Xu --- include/linux/scatterlist.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index d836e7440ee8..138e2f1bd08f 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -671,6 +671,7 @@ sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) #define SG_MITER_ATOMIC (1 << 0) /* use kmap_atomic */ #define SG_MITER_TO_SG (1 << 1) /* flush back to phys on unmap */ #define SG_MITER_FROM_SG (1 << 2) /* nop */ +#define SG_MITER_LOCAL (1 << 3) /* use kmap_local */ struct sg_mapping_iter { /* the following three fields can be accessed directly */ -- cgit v1.2.3 From 5416b8a741d6d09369b973cd9d4dacb1887c24df Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 15 Mar 2025 18:30:34 +0800 Subject: crypto: acomp - Add ACOMP_REQUEST_ALLOC and acomp_request_alloc_extra Add ACOMP_REQUEST_ALLOC which is a wrapper around acomp_request_alloc that falls back to a synchronous stack request if the allocation fails. Also add ACOMP_REQUEST_ON_STACK which stores the request on the stack only. The request should be freed with acomp_request_free. Finally add acomp_request_alloc_extra which gives the user extra memory to use in conjunction with the request. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 61ac11226638..ea3b95bdbde3 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -138,6 +138,7 @@ #define CRYPTO_TFM_REQ_FORBID_WEAK_KEYS 0x00000100 #define CRYPTO_TFM_REQ_MAY_SLEEP 0x00000200 #define CRYPTO_TFM_REQ_MAY_BACKLOG 0x00000400 +#define CRYPTO_TFM_REQ_ON_STACK 0x00000800 /* * Miscellaneous stuff. -- cgit v1.2.3 From fce8b8d5986b76a4fdd062e3eec1bb6420fee6c5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 16 Mar 2025 09:21:27 +0800 Subject: crypto: remove obsolete 'comp' compression API The 'comp' compression API has been superseded by the acomp API, which is a bit more cumbersome to use, but ultimately more flexible when it comes to hardware implementations. Now that all the users and implementations have been removed, let's remove the core plumbing of the 'comp' API as well. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/linux/crypto.h | 76 +------------------------------------------------- 1 file changed, 1 insertion(+), 75 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index ea3b95bdbde3..1e3809d28abd 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -24,7 +24,6 @@ */ #define CRYPTO_ALG_TYPE_MASK 0x0000000f #define CRYPTO_ALG_TYPE_CIPHER 0x00000001 -#define CRYPTO_ALG_TYPE_COMPRESS 0x00000002 #define CRYPTO_ALG_TYPE_AEAD 0x00000003 #define CRYPTO_ALG_TYPE_LSKCIPHER 0x00000004 #define CRYPTO_ALG_TYPE_SKCIPHER 0x00000005 @@ -246,26 +245,7 @@ struct cipher_alg { void (*cia_decrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src); }; -/** - * struct compress_alg - compression/decompression algorithm - * @coa_compress: Compress a buffer of specified length, storing the resulting - * data in the specified buffer. Return the length of the - * compressed data in dlen. - * @coa_decompress: Decompress the source buffer, storing the uncompressed - * data in the specified buffer. The length of the data is - * returned in dlen. - * - * All fields are mandatory.
- */ -struct compress_alg { - int (*coa_compress)(struct crypto_tfm *tfm, const u8 *src, - unsigned int slen, u8 *dst, unsigned int *dlen); - int (*coa_decompress)(struct crypto_tfm *tfm, const u8 *src, - unsigned int slen, u8 *dst, unsigned int *dlen); -}; - #define cra_cipher cra_u.cipher -#define cra_compress cra_u.compress /** * struct crypto_alg - definition of a cryptograpic cipher algorithm @@ -316,7 +296,7 @@ struct compress_alg { * transformation types. There are multiple options, such as * &crypto_skcipher_type, &crypto_ahash_type, &crypto_rng_type. * This field might be empty. In that case, there are no common - * callbacks. This is the case for: cipher, compress, shash. + * callbacks. This is the case for: cipher. * @cra_u: Callbacks implementing the transformation. This is a union of * multiple structures. Depending on the type of transformation selected * by @cra_type and @cra_flags above, the associated structure must be @@ -335,8 +315,6 @@ struct compress_alg { * @cra_init. * @cra_u.cipher: Union member which contains a single-block symmetric cipher * definition. See @struct @cipher_alg. - * @cra_u.compress: Union member which contains a (de)compression algorithm. - * See @struct @compress_alg. * @cra_module: Owner of this transformation implementation. Set to THIS_MODULE * @cra_list: internally used * @cra_users: internally used @@ -366,7 +344,6 @@ struct crypto_alg { union { struct cipher_alg cipher; - struct compress_alg compress; } cra_u; int (*cra_init)(struct crypto_tfm *tfm); @@ -440,10 +417,6 @@ struct crypto_tfm { void *__crt_ctx[] CRYPTO_MINALIGN_ATTR; }; -struct crypto_comp { - struct crypto_tfm base; -}; - /* * Transform user interface. */ @@ -500,53 +473,6 @@ static inline unsigned int crypto_tfm_ctx_alignment(void) return __alignof__(tfm->__crt_ctx); } -static inline struct crypto_comp *__crypto_comp_cast(struct crypto_tfm *tfm) -{ - return (struct crypto_comp *)tfm; -} - -static inline struct crypto_comp *crypto_alloc_comp(const char *alg_name, - u32 type, u32 mask) -{ - type &= ~CRYPTO_ALG_TYPE_MASK; - type |= CRYPTO_ALG_TYPE_COMPRESS; - mask |= CRYPTO_ALG_TYPE_MASK; - - return __crypto_comp_cast(crypto_alloc_base(alg_name, type, mask)); -} - -static inline struct crypto_tfm *crypto_comp_tfm(struct crypto_comp *tfm) -{ - return &tfm->base; -} - -static inline void crypto_free_comp(struct crypto_comp *tfm) -{ - crypto_free_tfm(crypto_comp_tfm(tfm)); -} - -static inline int crypto_has_comp(const char *alg_name, u32 type, u32 mask) -{ - type &= ~CRYPTO_ALG_TYPE_MASK; - type |= CRYPTO_ALG_TYPE_COMPRESS; - mask |= CRYPTO_ALG_TYPE_MASK; - - return crypto_has_alg(alg_name, type, mask); -} - -static inline const char *crypto_comp_name(struct crypto_comp *tfm) -{ - return crypto_tfm_alg_name(crypto_comp_tfm(tfm)); -} - -int crypto_comp_compress(struct crypto_comp *tfm, - const u8 *src, unsigned int slen, - u8 *dst, unsigned int *dlen); - -int crypto_comp_decompress(struct crypto_comp *tfm, - const u8 *src, unsigned int slen, - u8 *dst, unsigned int *dlen); - static inline void crypto_reqchain_init(struct crypto_async_request *req) { req->err = -EINPROGRESS; -- cgit v1.2.3 From cfe1f8776f23fd6dfe4ba9fcb695b129f0761187 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Thu, 13 Mar 2025 13:01:22 -0400 Subject: NFS: Extend rdirplus mount option with "force|none" There are certain users that wish to force the NFS client to choose READDIRPLUS over READDIR for a particular mount. Update the "rdirplus" mount option to optionally accept values. 
For "rdirplus=force", the NFS client will always attempt to use READDDIRPLUS. The setting of "rdirplus=none" is aliased to the existing "nordirplus". Signed-off-by: Benjamin Coddington Link: https://lore.kernel.org/r/c4cf0de4c8be0930b91bc74bee310d289781cd3b.1741885071.git.bcodding@redhat.com Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 1711eefcf24f..b83d16a42afc 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -167,6 +167,7 @@ struct nfs_server { #define NFS_MOUNT_TRUNK_DISCOVERY 0x04000000 #define NFS_MOUNT_SHUTDOWN 0x08000000 #define NFS_MOUNT_NO_ALIGNWRITE 0x10000000 +#define NFS_MOUNT_FORCE_RDIRPLUS 0x20000000 unsigned int fattr_valid; /* Valid attributes */ unsigned int caps; /* server capabilities */ -- cgit v1.2.3 From df210d9b0951d714c1668c511ca5c8ff38cf6916 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 7 Feb 2025 15:42:24 -0500 Subject: sunrpc: Add a sysfs file for adding a new xprt Writing to this file will clone the 'main' xprt of an xprt_switch and add it to be used as an additional connection. -- Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker v3: Replace call to xprt_iter_get_xprt() with xprt_iter_get_next() Link: https://lore.kernel.org/r/20250207204225.594002-5-anna@kernel.org Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtmultipath.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h index e411368cdacf..e4db5022fe92 100644 --- a/include/linux/sunrpc/xprtmultipath.h +++ b/include/linux/sunrpc/xprtmultipath.h @@ -56,6 +56,7 @@ extern void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt); extern void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, bool offline); +extern struct rpc_xprt *rpc_xprt_switch_get_main_xprt(struct rpc_xprt_switch *xps); extern void xprt_iter_init(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps); -- cgit v1.2.3 From 487fae09d7e266a58a13784983cc1ef6a2076526 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Mar 2025 11:45:06 -0400 Subject: NFS: Add a mount option to make ENETUNREACH errors fatal If the NFS client was initially created in a container, and that container is torn down, there is usually no possibity to go back and destroy any NFS clients that are hung because their virtual network devices have been unlinked. Add a flag that tells the NFS client that in these circumstances, it should treat ENETDOWN and ENETUNREACH errors as fatal to the NFS client. The option defaults to being on when the mount happens from inside a net namespace that is not "init_net". 
Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Tested-by: Jeff Layton Acked-by: Chuck Lever --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index b83d16a42afc..a6ce8590eaaf 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -168,6 +168,7 @@ struct nfs_server { #define NFS_MOUNT_SHUTDOWN 0x08000000 #define NFS_MOUNT_NO_ALIGNWRITE 0x10000000 #define NFS_MOUNT_FORCE_RDIRPLUS 0x20000000 +#define NFS_MOUNT_NETUNREACH_FATAL 0x40000000 unsigned int fattr_valid; /* Valid attributes */ unsigned int caps; /* server capabilities */ -- cgit v1.2.3 From 9827144bfb2b1baf1e78600da73dcc9e92e28415 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Mar 2025 10:50:26 -0400 Subject: NFS: Treat ENETUNREACH errors as fatal in containers Propagate the NFS_MOUNT_NETUNREACH_FATAL flag to work with the generic NFS client. If the flag is set, the client will receive ENETDOWN and ENETUNREACH errors from the RPC layer, and is expected to treat them as being fatal. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Tested-by: Jeff Layton Acked-by: Chuck Lever --- include/linux/nfs_fs_sb.h | 1 + include/linux/sunrpc/clnt.h | 5 ++++- include/linux/sunrpc/sched.h | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index a6ce8590eaaf..71319637a84e 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -50,6 +50,7 @@ struct nfs_client { #define NFS_CS_DS 7 /* - Server is a DS */ #define NFS_CS_REUSEPORT 8 /* - reuse src port on reconnect */ #define NFS_CS_PNFS 9 /* - Server used for pnfs */ +#define NFS_CS_NETUNREACH_FATAL 10 /* - ENETUNREACH errors are fatal */ struct sockaddr_storage cl_addr; /* server identifier */ size_t cl_addrlen; char * cl_hostname; /* hostname of server */ diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index fec976e58174..f8b406b0a1af 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -64,7 +64,9 @@ struct rpc_clnt { cl_noretranstimeo: 1,/* No retransmit timeouts */ cl_autobind : 1,/* use getport() */ cl_chatty : 1,/* be verbose */ - cl_shutdown : 1;/* rpc immediate -EIO */ + cl_shutdown : 1,/* rpc immediate -EIO */ + cl_netunreach_fatal : 1; + /* Treat ENETUNREACH errors as fatal */ struct xprtsec_parms cl_xprtsec; /* transport security policy */ struct rpc_rtt * cl_rtt; /* RTO estimator data */ @@ -175,6 +177,7 @@ struct rpc_add_xprt_test { #define RPC_CLNT_CREATE_SOFTERR (1UL << 10) #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11) #define RPC_CLNT_CREATE_CONNECTED (1UL << 12) +#define RPC_CLNT_CREATE_NETUNREACH_FATAL (1UL << 13) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index eac57914dcf3..ccba79ebf893 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -134,6 +134,7 @@ struct rpc_task_setup { #define RPC_TASK_MOVEABLE 0x0004 /* nfs4.1+ rpc tasks */ #define RPC_TASK_NULLCREDS 0x0010 /* Use AUTH_NULL credential */ #define RPC_CALL_MAJORSEEN 0x0020 /* major timeout seen */ +#define RPC_TASK_NETUNREACH_FATAL 0x0040 /* ENETUNREACH is fatal */ #define RPC_TASK_DYNAMIC 0x0080 /* task was kmalloc'ed */ #define RPC_TASK_NO_ROUND_ROBIN 0x0100 /* send requests on "main" xprt */ #define RPC_TASK_SOFT 0x0200 /* Use 
soft timeouts */ -- cgit v1.2.3 From 8c9f0df7a82a9f3bbdab4c796d302b20a33c1cc6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Mar 2025 08:04:35 -0400 Subject: pNFS/flexfiles: Treat ENETUNREACH errors as fatal in containers Propagate the NFS_MOUNT_NETUNREACH_FATAL flag to work with the pNFS flexfiles client. In these circumstances, the client needs to treat the ENETDOWN and ENETUNREACH errors as fatal, and should abandon the attempted I/O. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Tested-by: Jeff Layton Acked-by: Chuck Lever --- include/linux/nfs4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 5fa60fe441b5..d8cad844870a 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -300,6 +300,7 @@ enum nfsstat4 { /* error codes for internal client use */ #define NFS4ERR_RESET_TO_MDS 12001 #define NFS4ERR_RESET_TO_PNFS 12002 +#define NFS4ERR_FATAL_IOERROR 12003 static inline bool seqid_mutating_err(u32 err) { -- cgit v1.2.3 From ef490275297267d9461733ecd9b02bd3b798b3a4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 21 Mar 2025 18:04:34 +0000 Subject: io_uring/cmd: introduce io_uring_cmd_import_fixed_vec io_uring_cmd_import_fixed_vec() is a cmd helper around vectored registered buffer import functions, which caches the memory under the hood. The lifetime of the vector and hence the iterator is bound to the request. Furthermore, the user is not allowed to call it multiple times for a single request. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/97487a80dec3fb8cf8aeedf1f9026ef6d503fe4b.1742579999.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 598cacda4aa3..e6723fa95160 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -43,6 +43,11 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, struct io_uring_cmd *ioucmd, unsigned int issue_flags); +int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, + const struct iovec __user *uvec, + size_t uvec_segs, + int ddir, struct iov_iter *iter, + unsigned issue_flags); /* * Completes the request, i.e. posts an io_uring CQE and deallocates @ioucmd @@ -76,6 +81,14 @@ io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, { return -EOPNOTSUPP; } +static inline int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, + const struct iovec __user *uvec, + size_t uvec_segs, + int ddir, struct iov_iter *iter, + unsigned issue_flags) +{ + return -EOPNOTSUPP; +} static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, u64 ret2, unsigned issue_flags) { -- cgit v1.2.3 From 888bd8322dfc325dc5ad99184baba4e1fd91082d Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 26 Feb 2025 13:07:46 +0100 Subject: s390/pci: Introduce pdev->non_mappable_bars and replace VFIO_PCI_MMAP The ability to map PCI resources to user-space is controlled by global defines. For vfio there is VFIO_PCI_MMAP which is only disabled on s390 and controls mapping of PCI resources using vfio-pci with a fallback option via the pread()/pwrite() interface. For the PCI core there is ARCH_GENERIC_PCI_MMAP_RESOURCE which enables a generic implementation for mapping PCI resources plus the newer sysfs interface.
Then there is HAVE_PCI_MMAP which can be used with custom definitions of pci_mmap_resource_range() and the historical /proc/bus/pci interface. Both mechanisms are all or nothing. For s390 mapping PCI resources is possible and useful for testing and certain applications such as QEMU's vfio-pci based user-space NVMe driver. For certain devices, however, access to PCI resources via mappings to user-space is not possible and these must be excluded from the general PCI resource mapping mechanisms. Introduce pdev->non_mappable_bars to indicate that a PCI device's BARs cannot be accessed via mappings to user-space. In the future this enables per-device restrictions of PCI resource mapping. For now, set this flag for all PCI devices on s390 in line with the existing, general disable of PCI resource mapping. As s390 is the only user of the VFIO_PCI_MMAP Kconfig option this can already be replaced with a check of this new flag. Also add similar checks in the other code protected by HAVE_PCI_MMAP respectively ARCH_GENERIC_PCI_MMAP in preparation for enabling these for supported devices. Link: https://lore.kernel.org/lkml/20250212132808.08dcf03c.alex.williamson@redhat.com/ Link: https://lore.kernel.org/r/20250226-vfio_pci_mmap-v7-2-c5c0f1d26efd@linux.ibm.com Signed-off-by: Niklas Schnelle Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index f9424478a19a..5b5ed5ec5f0a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -476,6 +476,7 @@ struct pci_dev { unsigned int no_command_memory:1; /* No PCI_COMMAND_MEMORY */ unsigned int rom_bar_overlap:1; /* ROM BAR disable broken */ unsigned int rom_attr_enabled:1; /* Display of ROM attribute enabled? */ + unsigned int non_mappable_bars:1; /* BARs can't be mapped to user-space */ pci_dev_flags_t dev_flags; atomic_t enable_cnt; /* pci_enable_device has been called */ -- cgit v1.2.3 From aa9f168d55dc47c0de564f7dfe0e90467c9fee71 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 26 Feb 2025 13:07:47 +0100 Subject: s390/pci: Support mmap() of PCI resources except for ISM devices So far s390 does not allow mmap() of PCI resources to user-space via the usual mechanisms, though it does use it for RDMA. For the PCI sysfs resource files and /proc/bus/pci it defines neither HAVE_PCI_MMAP nor ARCH_GENERIC_PCI_MMAP_RESOURCE. For vfio-pci s390 previously relied on disabled VFIO_PCI_MMAP and now relies on setting pdev->non_mappable_bars for all devices. This is partly because access to mapped PCI resources from user-space requires special PCI load/store memory-I/O (MIO) instructions, or the special MMIO syscalls when these are not available. Still, such access is possible and useful not just for RDMA; in fact, not being able to mmap() PCI resources has previously caused extra work when testing devices. One thing that doesn't work with PCI resources mapped to user-space, though, is the s390-specific virtual ISM device. Not only because the BAR size of 256 TiB prevents mapping the whole BAR but also because access requires use of the legacy PCI instructions which are not accessible to user-space on systems with the newer MIO PCI instructions. Now with the pdev->non_mappable_bars flag ISM can be excluded from mapping its resources while making this functionality available for all other PCI devices. To this end introduce a minimal implementation of PCI_QUIRKS and use that to set pdev->non_mappable_bars for ISM devices only.
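Purely as an illustration of what such a quirk can look like (an editorial sketch using the generic PCI fixup machinery; the function name and fixup phase are assumptions, not the literal s390 hunk):

#include <linux/pci.h>
#include <linux/pci_ids.h>

/* Sketch: mark ISM BARs as not mappable to user-space. */
static void example_quirk_ism_bars(struct pci_dev *pdev)
{
	/* ISM needs the legacy PCI instructions; keep its BARs out of
	 * user-space mappings. */
	pdev->non_mappable_bars = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_ISM,
			 example_quirk_ism_bars);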
Then also set ARCH_GENERIC_PCI_MMAP_RESOURCE to take advantage of the generic implementation of pci_mmap_resource_range() enabling only the newer sysfs mmap() interface. This follows the recommendation in Documentation/PCI/sysfs-pci.rst. Link: https://lore.kernel.org/r/20250226-vfio_pci_mmap-v7-3-c5c0f1d26efd@linux.ibm.com Signed-off-by: Niklas Schnelle Signed-off-by: Bjorn Helgaas --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index de5deb1a0118..e0cdc290dff5 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -518,6 +518,7 @@ #define PCI_DEVICE_ID_IBM_ICOM_V2_ONE_PORT_RVX_ONE_PORT_MDM 0x0251 #define PCI_DEVICE_ID_IBM_ICOM_V2_ONE_PORT_RVX_ONE_PORT_MDM_PCIE 0x0361 #define PCI_DEVICE_ID_IBM_ICOM_FOUR_PORT_MODEL 0x252 +#define PCI_DEVICE_ID_IBM_ISM 0x04ed #define PCI_SUBVENDOR_ID_IBM 0x1014 #define PCI_SUBDEVICE_ID_IBM_SATURN_SERIAL_ONE_PORT 0x03d4 -- cgit v1.2.3 From ed3ba9b6e280e14cc3148c1b226ba453f02fa76c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sun, 16 Mar 2025 12:28:37 -0700 Subject: net: Remove RTNL dance for SIOCBRADDIF and SIOCBRDELIF. SIOCBRDELIF is passed to dev_ioctl() first and later forwarded to br_ioctl_call(), which causes unnecessary RTNL dance and the splat below [0] under RTNL pressure. Let's say Thread A is trying to detach a device from a bridge and Thread B is trying to remove the bridge. In dev_ioctl(), Thread A bumps the bridge device's refcnt by netdev_hold() and releases RTNL because the following br_ioctl_call() also re-acquires RTNL. In the race window, Thread B could acquire RTNL and try to remove the bridge device. Then, rtnl_unlock() by Thread B will release RTNL and wait for netdev_put() by Thread A. Thread A, however, must hold RTNL after the unlock in dev_ifsioc(), which may take long under RTNL pressure, resulting in the splat by Thread B.

  Thread A (SIOCBRDELIF)                 Thread B (SIOCBRDELBR)
  ----------------------                 ----------------------
  sock_ioctl                             sock_ioctl
  `- sock_do_ioctl                       `- br_ioctl_call
     `- dev_ioctl                           `- br_ioctl_stub
        |- rtnl_lock                           |
        |- dev_ifsioc                          '
        '  |- dev = __dev_get_by_name(...)     .
           |- netdev_hold(dev, ...)            |
          /|- rtnl_unlock ------.              |
         | |- br_ioctl_call      `-----------> |- rtnl_lock
  Race   | |  `- br_ioctl_stub                 |- br_del_bridge
  Window | |     |                             |  |- dev = __dev_get_by_name(...)
         | |     | May take long               |  `- br_dev_delete(dev, ...)
         | |     | under RTNL pressure         |     `- unregister_netdevice_queue(dev, ...)
         | |     |                             `- rtnl_unlock
          \| |- rtnl_lock <-'                     `- netdev_run_todo
           | |- ...                                  |- __rtnl_unlock
           | `- rtnl_unlock                          |- netdev_wait_allrefs_any
           |- netdev_put(dev, ...) <-----------------'
                                                     Wait refcnt decrement and log splat below

To avoid blocking SIOCBRDELBR unnecessarily, let's not call dev_ioctl() for SIOCBRADDIF and SIOCBRDELIF. In the dev_ioctl() path, we do the following:

 1. Copy struct ifreq by get_user_ifreq in sock_do_ioctl()
 2. Check CAP_NET_ADMIN in dev_ioctl()
 3. Call dev_load() in dev_ioctl()
 4. Fetch the master dev from ifr.ifr_name in dev_ifsioc()

3. can be done by request_module() in br_ioctl_call(), so we move 1., 2., and 4. to br_ioctl_stub(). Note that 2. is also checked later in add_del_if(), but it's better performed before RTNL. SIOCBRADDIF and SIOCBRDELIF have been processed in dev_ioctl() since the pre-git era, and there seems to be no specific reason to process them there. [0]: unregister_netdevice: waiting for wpan3 to become free.
Usage count = 2
ref_tracker: wpan3@ffff8880662d8608 has 1/1 users at
  __netdev_tracker_alloc include/linux/netdevice.h:4282 [inline]
  netdev_hold include/linux/netdevice.h:4311 [inline]
  dev_ifsioc+0xc6a/0x1160 net/core/dev_ioctl.c:624
  dev_ioctl+0x255/0x10c0 net/core/dev_ioctl.c:826
  sock_do_ioctl+0x1ca/0x260 net/socket.c:1213
  sock_ioctl+0x23a/0x6c0 net/socket.c:1318
  vfs_ioctl fs/ioctl.c:51 [inline]
  __do_sys_ioctl fs/ioctl.c:906 [inline]
  __se_sys_ioctl fs/ioctl.c:892 [inline]
  __x64_sys_ioctl+0x1a4/0x210 fs/ioctl.c:892
  do_syscall_x64 arch/x86/entry/common.c:52 [inline]
  do_syscall_64+0xcb/0x250 arch/x86/entry/common.c:83
  entry_SYSCALL_64_after_hwframe+0x77/0x7f

Fixes: 893b19587534 ("net: bridge: fix ioctl locking") Reported-by: syzkaller Reported-by: yan kang Reported-by: yue sun Closes: https://lore.kernel.org/netdev/SY8P300MB0421225D54EB92762AE8F0F2A1D32@SY8P300MB0421.AUSP300.PROD.OUTLOOK.COM/ Signed-off-by: Kuniyuki Iwashima Acked-by: Stanislav Fomichev Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250316192851.19781-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- include/linux/if_bridge.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 3ff96ae31bf6..c5fe3b2a53e8 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -65,11 +65,9 @@ struct br_ip_list { #define BR_DEFAULT_AGEING_TIME (300 * HZ) struct net_bridge; -void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br, - unsigned int cmd, struct ifreq *ifr, +void brioctl_set(int (*hook)(struct net *net, unsigned int cmd, void __user *uarg)); -int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd, - struct ifreq *ifr, void __user *uarg); +int br_ioctl_call(struct net *net, unsigned int cmd, void __user *uarg); #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING) int br_multicast_list_adjacent(struct net_device *dev, -- cgit v1.2.3 From e9a3682d17d5afee697fc95d3fa342d740767fad Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 27 Oct 2024 20:54:45 +0000 Subject: hwspinlock: Remove unused (devm_)hwspin_lock_request() devm_hwspin_lock_request() was added by 2018's commit 4f1acd758b08 ("hwspinlock: Add devm_xxx() APIs to request/free hwlock"); however, it's never been used; everyone uses the devm_hwspin_lock_request_specific() call instead. Remove it. Similarly, the non-devm variant isn't used. Remove it, and the referring documentation. Signed-off-by: Dr.
David Alan Gilbert Reviewed-by: Baolin Wang Link: https://lore.kernel.org/r/20241027205445.239108-1-linux@treblig.org Signed-off-by: Bjorn Andersson --- include/linux/hwspinlock.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hwspinlock.h b/include/linux/hwspinlock.h index f0231dbc4777..2f32d768dfd9 100644 --- a/include/linux/hwspinlock.h +++ b/include/linux/hwspinlock.h @@ -58,7 +58,6 @@ struct hwspinlock_pdata { int hwspin_lock_register(struct hwspinlock_device *bank, struct device *dev, const struct hwspinlock_ops *ops, int base_id, int num_locks); int hwspin_lock_unregister(struct hwspinlock_device *bank); -struct hwspinlock *hwspin_lock_request(void); struct hwspinlock *hwspin_lock_request_specific(unsigned int id); int hwspin_lock_free(struct hwspinlock *hwlock); int of_hwspin_lock_get_id(struct device_node *np, int index); @@ -70,7 +69,6 @@ void __hwspin_unlock(struct hwspinlock *, int, unsigned long *); int of_hwspin_lock_get_id_byname(struct device_node *np, const char *name); int hwspin_lock_bust(struct hwspinlock *hwlock, unsigned int id); int devm_hwspin_lock_free(struct device *dev, struct hwspinlock *hwlock); -struct hwspinlock *devm_hwspin_lock_request(struct device *dev); struct hwspinlock *devm_hwspin_lock_request_specific(struct device *dev, unsigned int id); int devm_hwspin_lock_unregister(struct device *dev, @@ -95,11 +93,6 @@ int devm_hwspin_lock_register(struct device *dev, * Note: ERR_PTR(-ENODEV) will still be considered a success for NULL-checking * users. Others, which care, can still check this with IS_ERR. */ -static inline struct hwspinlock *hwspin_lock_request(void) -{ - return ERR_PTR(-ENODEV); -} - static inline struct hwspinlock *hwspin_lock_request_specific(unsigned int id) { return ERR_PTR(-ENODEV); @@ -155,11 +148,6 @@ int devm_hwspin_lock_free(struct device *dev, struct hwspinlock *hwlock) return 0; } -static inline struct hwspinlock *devm_hwspin_lock_request(struct device *dev) -{ - return ERR_PTR(-ENODEV); -} - static inline struct hwspinlock *devm_hwspin_lock_request_specific(struct device *dev, unsigned int id) -- cgit v1.2.3 From fec04edb74126f21ac628c7be763c97deb49f69d Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 15 Dec 2024 02:20:23 +0000 Subject: hwspinlock: Remove unused hwspin_lock_get_id() hwspin_lock_get_id() has been unused since the original 2011 commit bd9a4c7df256 ("drivers: hwspinlock: add framework") Remove it and the corresponding docs. Note that the of_hwspin_lock_get_id() version is still in use, so leave that alone. Signed-off-by: Dr. 
David Alan Gilbert Reviewed-by: Baolin Wang Link: https://lore.kernel.org/r/20241215022023.181435-1-linux@treblig.org Signed-off-by: Bjorn Andersson --- include/linux/hwspinlock.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hwspinlock.h b/include/linux/hwspinlock.h index 2f32d768dfd9..f35b42e8c5de 100644 --- a/include/linux/hwspinlock.h +++ b/include/linux/hwspinlock.h @@ -61,7 +61,6 @@ int hwspin_lock_unregister(struct hwspinlock_device *bank); struct hwspinlock *hwspin_lock_request_specific(unsigned int id); int hwspin_lock_free(struct hwspinlock *hwlock); int of_hwspin_lock_get_id(struct device_node *np, int index); -int hwspin_lock_get_id(struct hwspinlock *hwlock); int __hwspin_lock_timeout(struct hwspinlock *, unsigned int, int, unsigned long *); int __hwspin_trylock(struct hwspinlock *, int, unsigned long *); @@ -131,11 +130,6 @@ static inline int of_hwspin_lock_get_id(struct device_node *np, int index) return 0; } -static inline int hwspin_lock_get_id(struct hwspinlock *hwlock) -{ - return 0; -} - static inline int of_hwspin_lock_get_id_byname(struct device_node *np, const char *name) { -- cgit v1.2.3 From 4d09dd11d7d0e7e7f535c0abc7de19b9da6612e9 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 20 Mar 2025 12:44:10 -0700 Subject: pds_fwctl: initial driver framework Initial files for adding a new fwctl driver for the AMD/Pensando PDS devices. This sets up a simple auxiliary_bus driver that registers with fwctl subsystem. It expects that a pds_core device has set up the auxiliary_device pds_core.fwctl Link: https://patch.msgid.link/r/20250320194412.67983-5-shannon.nelson@amd.com Reviewed-by: Leon Romanovsky Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: Shannon Nelson Signed-off-by: Jason Gunthorpe --- include/linux/pds/pds_adminq.h | 83 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h index 4b4e9a98b37b..22c6d77b3dcb 100644 --- a/include/linux/pds/pds_adminq.h +++ b/include/linux/pds/pds_adminq.h @@ -1179,6 +1179,84 @@ struct pds_lm_host_vf_status_cmd { u8 status; }; +enum pds_fwctl_cmd_opcode { + PDS_FWCTL_CMD_IDENT = 70, +}; + +/** + * struct pds_fwctl_cmd - Firmware control command structure + * @opcode: Opcode + * @rsvd: Reserved + * @ep: Endpoint identifier + * @op: Operation identifier + */ +struct pds_fwctl_cmd { + u8 opcode; + u8 rsvd[3]; + __le32 ep; + __le32 op; +} __packed; + +/** + * struct pds_fwctl_comp - Firmware control completion structure + * @status: Status of the firmware control operation + * @rsvd: Reserved + * @comp_index: Completion index in little-endian format + * @rsvd2: Reserved + * @color: Color bit indicating the state of the completion + */ +struct pds_fwctl_comp { + u8 status; + u8 rsvd; + __le16 comp_index; + u8 rsvd2[11]; + u8 color; +} __packed; + +/** + * struct pds_fwctl_ident_cmd - Firmware control identification command structure + * @opcode: Operation code for the command + * @rsvd: Reserved + * @version: Interface version + * @rsvd2: Reserved + * @len: Length of the identification data + * @ident_pa: Physical address of the identification data + */ +struct pds_fwctl_ident_cmd { + u8 opcode; + u8 rsvd; + u8 version; + u8 rsvd2; + __le32 len; + __le64 ident_pa; +} __packed; + +/* future feature bits here + * enum pds_fwctl_features { + * }; + * (compilers don't like empty enums) + */ + +/** + * struct pds_fwctl_ident - Firmware 
control identification structure + * @features: Supported features (enum pds_fwctl_features) + * @version: Interface version + * @rsvd: Reserved + * @max_req_sz: Maximum request size + * @max_resp_sz: Maximum response size + * @max_req_sg_elems: Maximum number of request SGs + * @max_resp_sg_elems: Maximum number of response SGs + */ +struct pds_fwctl_ident { + __le64 features; + u8 version; + u8 rsvd[3]; + __le32 max_req_sz; + __le32 max_resp_sz; + u8 max_req_sg_elems; + u8 max_resp_sg_elems; +} __packed; + union pds_core_adminq_cmd { u8 opcode; u8 bytes[64]; @@ -1216,6 +1294,9 @@ union pds_core_adminq_cmd { struct pds_lm_dirty_enable_cmd lm_dirty_enable; struct pds_lm_dirty_disable_cmd lm_dirty_disable; struct pds_lm_dirty_seq_ack_cmd lm_dirty_seq_ack; + + struct pds_fwctl_cmd fwctl; + struct pds_fwctl_ident_cmd fwctl_ident; }; union pds_core_adminq_comp { @@ -1243,6 +1324,8 @@ union pds_core_adminq_comp { struct pds_lm_state_size_comp lm_state_size; struct pds_lm_dirty_status_comp lm_dirty_status; + + struct pds_fwctl_comp fwctl; }; #ifndef __CHECKER__ -- cgit v1.2.3 From 92c66ee829b99a860a90f62ef16df3e42f92edac Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 20 Mar 2025 12:44:11 -0700 Subject: pds_fwctl: add rpc and query support The pds_fwctl driver doesn't know what RPC operations are available in the firmware, so also doesn't know what scope they might have. The userland utility supplies the firmware "endpoint" and "operation" id values and this driver queries the firmware for endpoints and their available operations. The operation descriptions include the scope information which the driver uses for scope testing. Link: https://patch.msgid.link/r/20250320194412.67983-6-shannon.nelson@amd.com Reviewed-by: Leon Romanovsky Signed-off-by: Brett Creeley Reviewed-by: Jonathan Cameron Signed-off-by: Shannon Nelson Signed-off-by: Jason Gunthorpe --- include/linux/pds/pds_adminq.h | 194 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h index 22c6d77b3dcb..ddd111f04ca0 100644 --- a/include/linux/pds/pds_adminq.h +++ b/include/linux/pds/pds_adminq.h @@ -1181,6 +1181,8 @@ struct pds_lm_host_vf_status_cmd { enum pds_fwctl_cmd_opcode { PDS_FWCTL_CMD_IDENT = 70, + PDS_FWCTL_CMD_RPC = 71, + PDS_FWCTL_CMD_QUERY = 72, }; /** @@ -1257,6 +1259,194 @@ struct pds_fwctl_ident { u8 max_resp_sg_elems; } __packed; +enum pds_fwctl_query_entity { + PDS_FWCTL_RPC_ROOT = 0, + PDS_FWCTL_RPC_ENDPOINT = 1, + PDS_FWCTL_RPC_OPERATION = 2, +}; + +#define PDS_FWCTL_RPC_OPCODE_CMD_SHIFT 0 +#define PDS_FWCTL_RPC_OPCODE_CMD_MASK GENMASK(15, PDS_FWCTL_RPC_OPCODE_CMD_SHIFT) +#define PDS_FWCTL_RPC_OPCODE_VER_SHIFT 16 +#define PDS_FWCTL_RPC_OPCODE_VER_MASK GENMASK(23, PDS_FWCTL_RPC_OPCODE_VER_SHIFT) + +#define PDS_FWCTL_RPC_OPCODE_GET_CMD(op) FIELD_GET(PDS_FWCTL_RPC_OPCODE_CMD_MASK, op) +#define PDS_FWCTL_RPC_OPCODE_GET_VER(op) FIELD_GET(PDS_FWCTL_RPC_OPCODE_VER_MASK, op) + +#define PDS_FWCTL_RPC_OPCODE_CMP(op1, op2) \ + (PDS_FWCTL_RPC_OPCODE_GET_CMD(op1) == PDS_FWCTL_RPC_OPCODE_GET_CMD(op2) && \ + PDS_FWCTL_RPC_OPCODE_GET_VER(op1) <= PDS_FWCTL_RPC_OPCODE_GET_VER(op2)) + +/* + * FW command attributes that map to the FWCTL scope values + */ +#define PDSFC_FW_CMD_ATTR_READ 0x00 +#define PDSFC_FW_CMD_ATTR_DEBUG_READ 0x02 +#define PDSFC_FW_CMD_ATTR_WRITE 0x04 +#define PDSFC_FW_CMD_ATTR_DEBUG_WRITE 0x08 +#define PDSFC_FW_CMD_ATTR_SYNC 0x10 + +/** + * struct pds_fwctl_query_cmd - Firmware 
control query command structure + * @opcode: Operation code for the command + * @entity: Entity type to query (enum pds_fwctl_query_entity) + * @version: Version of the query data structure supported by the driver + * @rsvd: Reserved + * @query_data_buf_len: Length of the query data buffer + * @query_data_buf_pa: Physical address of the query data buffer + * @ep: Endpoint identifier to query (when entity is PDS_FWCTL_RPC_ENDPOINT) + * @op: Operation identifier to query (when entity is PDS_FWCTL_RPC_OPERATION) + * + * This structure is used to send a query command to the firmware control + * interface. The structure is packed to ensure there is no padding between + * the fields. + */ +struct pds_fwctl_query_cmd { + u8 opcode; + u8 entity; + u8 version; + u8 rsvd; + __le32 query_data_buf_len; + __le64 query_data_buf_pa; + union { + __le32 ep; + __le32 op; + }; +} __packed; + +/** + * struct pds_fwctl_query_comp - Firmware control query completion structure + * @status: Status of the query command + * @rsvd: Reserved + * @comp_index: Completion index in little-endian format + * @version: Version of the query data structure returned by firmware. This + * should be less than or equal to the version supported by the driver + * @rsvd2: Reserved + * @color: Color bit indicating the state of the completion + */ +struct pds_fwctl_query_comp { + u8 status; + u8 rsvd; + __le16 comp_index; + u8 version; + u8 rsvd2[2]; + u8 color; +} __packed; + +/** + * struct pds_fwctl_query_data_endpoint - query data for entity PDS_FWCTL_RPC_ROOT + * @id: The identifier for the data endpoint + */ +struct pds_fwctl_query_data_endpoint { + __le32 id; +} __packed; + +/** + * struct pds_fwctl_query_data_operation - query data for entity PDS_FWCTL_RPC_ENDPOINT + * @id: Operation identifier + * @scope: Scope of the operation (enum fwctl_rpc_scope) + * @rsvd: Reserved + */ +struct pds_fwctl_query_data_operation { + __le32 id; + u8 scope; + u8 rsvd[3]; +} __packed; + +/** + * struct pds_fwctl_query_data - query data structure + * @version: Version of the query data structure + * @rsvd: Reserved + * @num_entries: Number of entries in the union + * @entries: Array of query data entries, depending on the entity type + */ +struct pds_fwctl_query_data { + u8 version; + u8 rsvd[3]; + __le32 num_entries; + u8 entries[] __counted_by_le(num_entries); +} __packed; + +/** + * struct pds_fwctl_rpc_cmd - Firmware control RPC command + * @opcode: opcode PDS_FWCTL_CMD_RPC + * @rsvd: Reserved + * @flags: Indicates indirect request and/or response handling + * @ep: Endpoint identifier + * @op: Operation identifier + * @inline_req0: Buffer for inline request + * @inline_req1: Buffer for inline request + * @req_pa: Physical address of request data + * @req_sz: Size of the request + * @req_sg_elems: Number of request SGs + * @req_rsvd: Reserved + * @inline_req2: Buffer for inline request + * @resp_pa: Physical address of response data + * @resp_sz: Size of the response + * @resp_sg_elems: Number of response SGs + * @resp_rsvd: Reserved + */ +struct pds_fwctl_rpc_cmd { + u8 opcode; + u8 rsvd; + __le16 flags; +#define PDS_FWCTL_RPC_IND_REQ 0x1 +#define PDS_FWCTL_RPC_IND_RESP 0x2 + __le32 ep; + __le32 op; + u8 inline_req0[16]; + union { + u8 inline_req1[16]; + struct { + __le64 req_pa; + __le32 req_sz; + u8 req_sg_elems; + u8 req_rsvd[3]; + }; + }; + union { + u8 inline_req2[16]; + struct { + __le64 resp_pa; + __le32 resp_sz; + u8 resp_sg_elems; + u8 resp_rsvd[3]; + }; + }; +} __packed; + +/** + * struct pds_sg_elem - Transmit scatter-gather (SG) 
descriptor element + * @addr: DMA address of SG element data buffer + * @len: Length of SG element data buffer, in bytes + * @rsvd: Reserved + */ +struct pds_sg_elem { + __le64 addr; + __le32 len; + u8 rsvd[4]; +} __packed; + +/** + * struct pds_fwctl_rpc_comp - Completion of a firmware control RPC + * @status: Status of the command + * @rsvd: Reserved + * @comp_index: Completion index of the command + * @err: Error code, if any, from the RPC + * @resp_sz: Size of the response + * @rsvd2: Reserved + * @color: Color bit indicating the state of the completion + */ +struct pds_fwctl_rpc_comp { + u8 status; + u8 rsvd; + __le16 comp_index; + __le32 err; + __le32 resp_sz; + u8 rsvd2[3]; + u8 color; +} __packed; + union pds_core_adminq_cmd { u8 opcode; u8 bytes[64]; @@ -1297,6 +1487,8 @@ union pds_core_adminq_cmd { struct pds_fwctl_cmd fwctl; struct pds_fwctl_ident_cmd fwctl_ident; + struct pds_fwctl_rpc_cmd fwctl_rpc; + struct pds_fwctl_query_cmd fwctl_query; }; union pds_core_adminq_comp { @@ -1326,6 +1518,8 @@ union pds_core_adminq_comp { struct pds_lm_dirty_status_comp lm_dirty_status; struct pds_fwctl_comp fwctl; + struct pds_fwctl_rpc_comp fwctl_rpc; + struct pds_fwctl_query_comp fwctl_query; }; #ifndef __CHECKER__ -- cgit v1.2.3 From fa23a338de93aa03eb0b6146a0440f5762309f85 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Mar 2025 13:36:11 +0000 Subject: mm: separate folio_split_memcg_refs() from split_page_memcg() Patch series "Minor memcg cleanups & prep for memdescs", v2. Separate the handling of accounted folios and GFP_ACCOUNT pages for easier to understand code. For more detail, see https://lore.kernel.org/linux-mm/Z9LwTOudOlCGny3f@casper.infradead.org/ This patch (of 5): Folios always use memcg_data to refer to the mem_cgroup while pages allocated with GFP_ACCOUNT have a pointer to the obj_cgroup. Since the caller already knows what it has, split the function into two and then we don't need to check. Move the assignment of split folio memcg_data to the point where we set up the other parts of the new folio. That leaves folio_split_memcg_refs() just handling the memcg accounting. 
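An editorial sketch of the resulting division of labour (both wrappers below are hypothetical callers, shown with this commit's signatures):

#include <linux/memcontrol.h>

/* A folio charged to a memcg: move the memcg references explicitly. */
static void example_split_charged_folio(struct folio *folio,
					unsigned int old_order,
					unsigned int new_order)
{
	folio_split_memcg_refs(folio, old_order, new_order);
}

/* A kernel page allocated with GFP_ACCOUNT: the obj_cgroup handling
 * stays behind split_page_memcg(). */
static void example_split_kmem_page(struct page *page, int old_order,
				    int new_order)
{
	split_page_memcg(page, old_order, new_order);
}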
Link: https://lkml.kernel.org/r/20250314133617.138071-1-willy@infradead.org Link: https://lkml.kernel.org/r/20250314133617.138071-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Zi Yan Acked-by: Roman Gushchin Cc: David Hildenbrand Cc: Matthew Wilcow (Oracle) Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 57664e2a8fb7..d090089c5497 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1039,6 +1039,8 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, } void split_page_memcg(struct page *head, int old_order, int new_order); +void folio_split_memcg_refs(struct folio *folio, unsigned old_order, + unsigned new_order); static inline u64 cgroup_id_from_mm(struct mm_struct *mm) { @@ -1463,6 +1465,11 @@ static inline void split_page_memcg(struct page *head, int old_order, int new_or { } +static inline void folio_split_memcg_refs(struct folio *folio, + unsigned old_order, unsigned new_order) +{ +} + static inline u64 cgroup_id_from_mm(struct mm_struct *mm) { return 0; -- cgit v1.2.3 From 1506c25508acd740ced5e92c539ed3d12f622c5b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Mar 2025 13:36:12 +0000 Subject: mm: simplify split_page_memcg() The last argument to split_page_memcg() is now always 0, so remove it, effectively reverting commit b8791381d7ed. Link: https://lkml.kernel.org/r/20250314133617.138071-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Zi Yan Acked-by: Roman Gushchin Cc: David Hildenbrand Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d090089c5497..ea28cacfb0d2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1038,7 +1038,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, rcu_read_unlock(); } -void split_page_memcg(struct page *head, int old_order, int new_order); +void split_page_memcg(struct page *first, unsigned order); void folio_split_memcg_refs(struct folio *folio, unsigned old_order, unsigned new_order); @@ -1461,7 +1461,7 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } -static inline void split_page_memcg(struct page *head, int old_order, int new_order) +static inline void split_page_memcg(struct page *first, unsigned order) { } -- cgit v1.2.3 From 8492936abb49eda26e00eab01e58d7e69f2355ba Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Mar 2025 13:36:14 +0000 Subject: mm: simplify folio_memcg_charged() There's no need to check which kind of pointer is in the memcg_data field, all we actually care about is whether it's zero or not. Saves 70 bytes in workingset_activation() with the Debian config. 
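A sketch of the typical caller pattern this simplifies (the guard helper is hypothetical):

#include <linux/memcontrol.h>

/* After this patch the check is a single load: memcg_data == 0 means the
 * folio was never charged, whether or not it would be a kmem folio. */
static inline bool example_skip_memcg_work(struct folio *folio)
{
	return !folio_memcg_charged(folio);
}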
Link: https://lkml.kernel.org/r/20250314133617.138071-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Roman Gushchin Cc: David Hildenbrand Cc: Michal Hocko Cc: Muchun Song Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ea28cacfb0d2..53364526d877 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -438,9 +438,7 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) */ static inline bool folio_memcg_charged(struct folio *folio) { - if (folio_memcg_kmem(folio)) - return __folio_objcg(folio) != NULL; - return __folio_memcg(folio) != NULL; + return folio->memcg_data != 0; } /* -- cgit v1.2.3 From 835de37603ef6412949df0a607128f5fffed4576 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Fri, 14 Mar 2025 15:37:54 -0600 Subject: meminfo: add a per node counter for balloon drivers Patch series "track memory used by balloon drivers", v2. This series introduces a way to track memory used by balloon drivers. Add a NR_BALLOON_PAGES counter to track how many pages are reclaimed by the balloon drivers. First add the accounting, then updates the balloon drivers (virtio, Hyper-V, VMware, Pseries-cmm, and Xen) to maintain this counter. The virtio, Vmware, and pseries-cmm balloon drivers utilize the balloon_compaction interface to allocate and free balloon pages. Other balloon drivers will have to maintain this counter manually. This makes the information visible in memory reporting interfaces like /proc/meminfo, show_mem, and OOM reporting. This provides admins visibility into their VM balloon sizes without requiring different virtualization tooling. Furthermore, this information is helpful when debugging an OOM inside a VM. This patch (of 4): Add NR_BALLOON_PAGES counter to track memory used by balloon drivers and expose it through /proc/meminfo and other memory reporting interfaces. [npache@redhat.com: document Balloon Meminfo entry] Link: https://lkml.kernel.org/r/a0315ccf-f244-460e-8643-fd7388724fe5@redhat.com Link: https://lkml.kernel.org/r/20250314213757.244258-1-npache@redhat.com Link: https://lkml.kernel.org/r/20250314213757.244258-2-npache@redhat.com Signed-off-by: Nico Pache Cc: Alexander Atanasov Cc: Chengming Zhou Cc: David Hildenbrand Cc: Dexuan Cui Cc: Haiyang Zhang Cc: Johannes Weiner Cc: Juegren Gross Cc: Kanchana P Sridhar Cc: K. Y. Srinivasan Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Muchun Song Cc: Nhat Pham Cc: Oleksandr Tyshchenko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Stefano Stabellini Cc: Wei Liu Cc: Michael Kelley Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 37c29f3fbca8..a9db0fbd2b94 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -224,6 +224,7 @@ enum node_stat_item { #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif + NR_BALLOON_PAGES, NR_VM_NODE_STAT_ITEMS }; -- cgit v1.2.3 From 3b23a44f1f196741596616082e759f7f3a400e78 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Tue, 18 Mar 2025 11:30:28 -0700 Subject: mm/damon: implement a new DAMOS filter type for active pages Patch series "mm/damon: introduce DAMOS filter type for active pages". 
The memory reclaim algorithm categorizes pages into active and inactive lists, separately for file and anon pages. The system's performance relies heavily on the (relative and absolute) accuracy of this categorization. This patch series add a new DAMOS filter for pages' activeness, giving us visibility into the access frequency of the pages on each list. This insight can help us diagnose issues with the active-inactive balancing dynamics, and make decisions to optimize reclaim efficiency and memory utilization. For instance, we might decide to enable DAMON_LRU_SORT, if we find that there are pages on the active list that are infrequently accessed, or less frequently accessed than pages on the inactive list. This patch (of 2): Implement a DAMOS filter type for active pages on DAMON kernel API, and add support of it from the physical address space DAMON operations set (paddr). Link: https://lkml.kernel.org/r/20250318183029.2062917-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20250318183029.2062917-2-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 3db4f77261f5..47e36e6ea203 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -334,6 +334,7 @@ struct damos_stat { /** * enum damos_filter_type - Type of memory for &struct damos_filter * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. + * @DAMOS_FILTER_TYPE_ACTIVE: Active pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @DAMOS_FILTER_TYPE_YOUNG: Recently accessed pages. * @DAMOS_FILTER_TYPE_HUGEPAGE_SIZE: Page is part of a hugepage. @@ -355,6 +356,7 @@ struct damos_stat { */ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, + DAMOS_FILTER_TYPE_ACTIVE, DAMOS_FILTER_TYPE_MEMCG, DAMOS_FILTER_TYPE_YOUNG, DAMOS_FILTER_TYPE_HUGEPAGE_SIZE, -- cgit v1.2.3 From e452872b40e3f1fb92adf0d573a0a6a7c9f6ce22 Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Tue, 18 Mar 2025 15:58:32 +0800 Subject: mm: vmscan: split proactive reclaim statistics from direct reclaim statistics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Adding Proactive Memory Reclaim Statistics". These two patches are related to proactive memory reclaim. Patch 1 Split proactive reclaim statistics from direct reclaim counters and introduces new counters: pgsteal_proactive, pgdemote_proactive, and pgscan_proactive. Patch 2 Adds pswpin and pswpout items to the cgroup-v2 documentation. This patch (of 2): In proactive memory reclaim scenarios, it is necessary to accurately track proactive reclaim statistics to dynamically adjust the frequency and amount of memory being reclaimed proactively. Currently, proactive reclaim is included in direct reclaim statistics, which can make these direct reclaim statistics misleading. Therefore, separate proactive reclaim memory from the direct reclaim counters by introducing new counters: pgsteal_proactive, pgdemote_proactive, and pgscan_proactive, to avoid confusion with direct reclaim. 
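As a hedged sketch only (the reclaim-path plumbing lives outside
include/linux and is not shown in this hunk; the helper below is invented
for illustration), the new items let an accounting site pick a counter by
reclaim source:

  /* Hypothetical helper: choose the PGSTEAL_* item per reclaim source. */
  static enum vm_event_item pgsteal_item(bool proactive, bool kswapd)
  {
  	if (proactive)
  		return PGSTEAL_PROACTIVE;
  	return kswapd ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
  }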
Link: https://lkml.kernel.org/r/20250318075833.90615-1-jiahao.kernel@gmail.com
Link: https://lkml.kernel.org/r/20250318075833.90615-2-jiahao.kernel@gmail.com
Signed-off-by: Hao Jia
Acked-by: Johannes Weiner
Cc: Jonathan Corbet
Cc: Michal Hocko
Cc: Michal Koutný
Cc: Muchun Song
Cc: Roman Gushchin
Cc: Shakeel Butt
Cc: Tejun Heo
Signed-off-by: Andrew Morton
---
 include/linux/mmzone.h        | 1 +
 include/linux/vm_event_item.h | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a9db0fbd2b94..f7fe1126dc75 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -221,6 +221,7 @@ enum node_stat_item {
 	PGDEMOTE_KSWAPD,
 	PGDEMOTE_DIRECT,
 	PGDEMOTE_KHUGEPAGED,
+	PGDEMOTE_PROACTIVE,
 #ifdef CONFIG_HUGETLB_PAGE
 	NR_HUGETLB,
 #endif
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index f70d0958095c..f11b6fa9c5b3 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -41,9 +41,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		PGSTEAL_KSWAPD,
 		PGSTEAL_DIRECT,
 		PGSTEAL_KHUGEPAGED,
+		PGSTEAL_PROACTIVE,
 		PGSCAN_KSWAPD,
 		PGSCAN_DIRECT,
 		PGSCAN_KHUGEPAGED,
+		PGSCAN_PROACTIVE,
 		PGSCAN_DIRECT_THROTTLE,
 		PGSCAN_ANON,
 		PGSCAN_FILE,
-- cgit v1.2.3


From 5f5ee52d4f58605330b09851273d6e56aaadd29e Mon Sep 17 00:00:00 2001
From: Jinjiang Tu
Date: Tue, 18 Mar 2025 16:39:38 +0800
Subject: mm/hwpoison: introduce folio_contain_hwpoisoned_page() helper

Patch series "mm/vmscan: don't try to reclaim hwpoison folio".

Fix a bug during memory reclaim if the folio is hwpoisoned.

This patch (of 2):

Introduce the helper folio_contain_hwpoisoned_page() to check whether the entire folio is hwpoisoned or it contains hwpoisoned pages.

Link: https://lkml.kernel.org/r/20250318083939.987651-1-tujinjiang@huawei.com
Link: https://lkml.kernel.org/r/20250318083939.987651-2-tujinjiang@huawei.com
Signed-off-by: Jinjiang Tu
Acked-by: Miaohe Lin
Cc: David Hildenbrand
Cc: Kefeng Wang
Cc: Nanyong Sun
Cc: Naoya Horiguchi
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/page-flags.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f26ce54c7aa4..68f4b188fc6b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1098,6 +1098,12 @@ static inline bool is_page_hwpoison(const struct page *page)
 	return folio_test_hugetlb(folio) && PageHWPoison(&folio->page);
 }
 
+static inline bool folio_contain_hwpoisoned_page(struct folio *folio)
+{
+	return folio_test_hwpoison(folio) ||
+	       (folio_test_large(folio) && folio_test_has_hwpoisoned(folio));
+}
+
 bool is_free_buddy_page(const struct page *page);
 
 PAGEFLAG(Isolated, isolated, PF_ANY);
-- cgit v1.2.3


From 3cf67d61ff98672120f6ad07528afa165df12588 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Tue, 25 Feb 2025 16:02:34 +0900
Subject: hung_task: show the blocker task if the task is hung on mutex

Patch series "hung_task: Dump the blocking task stacktrace", v4.

The hung_task detector is very useful for detecting lockups. However, since it only dumps the blocked (uninterruptible sleep) processes, it is not enough to identify the root cause of that lockup.

For example, if a process holds a mutex and sleeps on an event in interruptible state for a long time, the other processes will wait on the mutex in uninterruptible state. In this case, the waiter processes are dumped, but the blocker process is not shown because it is sleeping in interruptible state.
This adds a feature to dump the blocker task which holds a mutex when detecting a hung task. e.g. INFO: task cat:115 blocked for more than 122 seconds. Not tainted 6.14.0-rc3-00003-ga8946be3de00 #156 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:cat state:D stack:13432 pid:115 tgid:115 ppid:106 task_flags:0x400100 flags:0x00000002 Call Trace: __schedule+0x731/0x960 ? schedule_preempt_disabled+0x54/0xa0 schedule+0xb7/0x140 ? __mutex_lock+0x51b/0xa60 ? __mutex_lock+0x51b/0xa60 schedule_preempt_disabled+0x54/0xa0 __mutex_lock+0x51b/0xa60 read_dummy+0x23/0x70 full_proxy_read+0x6a/0xc0 vfs_read+0xc2/0x340 ? __pfx_direct_file_splice_eof+0x10/0x10 ? do_sendfile+0x1bd/0x2e0 ksys_read+0x76/0xe0 do_syscall_64+0xe3/0x1c0 ? exc_page_fault+0xa9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x4840cd RSP: 002b:00007ffe99071828 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004840cd RDX: 0000000000001000 RSI: 00007ffe99071870 RDI: 0000000000000003 RBP: 00007ffe99071870 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000001000000 R11: 0000000000000246 R12: 0000000000001000 R13: 00000000132fd3a0 R14: 0000000000000001 R15: ffffffffffffffff INFO: task cat:115 is blocked on a mutex likely owned by task cat:114. task:cat state:S stack:13432 pid:114 tgid:114 ppid:106 task_flags:0x400100 flags:0x00000002 Call Trace: __schedule+0x731/0x960 ? schedule_timeout+0xa8/0x120 schedule+0xb7/0x140 schedule_timeout+0xa8/0x120 ? __pfx_process_timeout+0x10/0x10 msleep_interruptible+0x3e/0x60 read_dummy+0x2d/0x70 full_proxy_read+0x6a/0xc0 vfs_read+0xc2/0x340 ? __pfx_direct_file_splice_eof+0x10/0x10 ? do_sendfile+0x1bd/0x2e0 ksys_read+0x76/0xe0 do_syscall_64+0xe3/0x1c0 ? exc_page_fault+0xa9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x4840cd RSP: 002b:00007ffe3e0147b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004840cd RDX: 0000000000001000 RSI: 00007ffe3e014800 RDI: 0000000000000003 RBP: 00007ffe3e014800 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000001000000 R11: 0000000000000246 R12: 0000000000001000 R13: 000000001a0a93a0 R14: 0000000000000001 R15: ffffffffffffffff TBD: We can extend this feature to cover other locks like rwsem and rt_mutex, but rwsem requires to dump all the tasks which acquire and wait that rwsem. We can follow the waiter link but the output will be a bit different compared with mutex case. This patch (of 2): The "hung_task" shows a long-time uninterruptible slept task, but most often, it's blocked on a mutex acquired by another task. Without dumping such a task, investigating the root cause of the hung task problem is very difficult. This introduce task_struct::blocker_mutex to point the mutex lock which this task is waiting for. Since the mutex has "owner" information, we can find the owner task and dump it with hung tasks. Note: the owner can be changed while dumping the owner task, so this is "likely" the owner of the mutex. With this change, the hung task shows blocker task's info like below; INFO: task cat:115 blocked for more than 122 seconds. Not tainted 6.14.0-rc3-00003-ga8946be3de00 #156 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:cat state:D stack:13432 pid:115 tgid:115 ppid:106 task_flags:0x400100 flags:0x00000002 Call Trace: __schedule+0x731/0x960 ? schedule_preempt_disabled+0x54/0xa0 schedule+0xb7/0x140 ? __mutex_lock+0x51b/0xa60 ? 
__mutex_lock+0x51b/0xa60 schedule_preempt_disabled+0x54/0xa0 __mutex_lock+0x51b/0xa60 read_dummy+0x23/0x70 full_proxy_read+0x6a/0xc0 vfs_read+0xc2/0x340 ? __pfx_direct_file_splice_eof+0x10/0x10 ? do_sendfile+0x1bd/0x2e0 ksys_read+0x76/0xe0 do_syscall_64+0xe3/0x1c0 ? exc_page_fault+0xa9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x4840cd RSP: 002b:00007ffe99071828 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004840cd RDX: 0000000000001000 RSI: 00007ffe99071870 RDI: 0000000000000003 RBP: 00007ffe99071870 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000001000000 R11: 0000000000000246 R12: 0000000000001000 R13: 00000000132fd3a0 R14: 0000000000000001 R15: ffffffffffffffff INFO: task cat:115 is blocked on a mutex likely owned by task cat:114. task:cat state:S stack:13432 pid:114 tgid:114 ppid:106 task_flags:0x400100 flags:0x00000002 Call Trace: __schedule+0x731/0x960 ? schedule_timeout+0xa8/0x120 schedule+0xb7/0x140 schedule_timeout+0xa8/0x120 ? __pfx_process_timeout+0x10/0x10 msleep_interruptible+0x3e/0x60 read_dummy+0x2d/0x70 full_proxy_read+0x6a/0xc0 vfs_read+0xc2/0x340 ? __pfx_direct_file_splice_eof+0x10/0x10 ? do_sendfile+0x1bd/0x2e0 ksys_read+0x76/0xe0 do_syscall_64+0xe3/0x1c0 ? exc_page_fault+0xa9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x4840cd RSP: 002b:00007ffe3e0147b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004840cd RDX: 0000000000001000 RSI: 00007ffe3e014800 RDI: 0000000000000003 RBP: 00007ffe3e014800 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000001000000 R11: 0000000000000246 R12: 0000000000001000 R13: 000000001a0a93a0 R14: 0000000000000001 R15: ffffffffffffffff [akpm@linux-foundation.org: implement debug_show_blocker() in C rather than in CPP] Link: https://lkml.kernel.org/r/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com Link: https://lkml.kernel.org/r/174046695384.2194069.16796289525958195643.stgit@mhiramat.tok.corp.google.com Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Waiman Long Reviewed-by: Lance Yang Reviewed-by: Sergey Senozhatsky Cc: Anna Schumaker Cc: Boqun Feng Cc: Ingo Molnar Cc: Joel Granados Cc: Kent Overstreet Cc: Steven Rostedt Cc: Tomasz Figa Cc: Will Deacon Cc: Yongliang Gao Signed-off-by: Andrew Morton --- include/linux/mutex.h | 2 ++ include/linux/sched.h | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 2bf91b57591b..2143d05116be 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -202,4 +202,6 @@ DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T)) DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T)) DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T) == 0) +extern unsigned long mutex_get_owner(struct mutex *lock); + #endif /* __LINUX_MUTEX_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c15365a30c0..1419d94c8e87 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1217,6 +1217,10 @@ struct task_struct { struct mutex_waiter *blocked_on; #endif +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + struct mutex *blocker_mutex; +#endif + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP int non_block_count; #endif -- cgit v1.2.3 From 8ee065a6fdd285387d0eca5e1139ffb182a6e626 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 17 Mar 2025 20:11:10 +0200 Subject: resource: split DEFINE_RES_NAMED_DESC() out of DEFINE_RES_NAMED() 
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "resource: Split and use DEFINE_RES*() macros", v2.

Replace open-coded variants of the DEFINE_RES*() macros. Note that there are many more possibilities across the kernel, and even in resources.c; however, the latter contains less trivial leftovers. That's why the examples cover only straightforward conversions.

This patch (of 4):

In some cases it would be useful to supply a predefined descriptor for the resource. For this, introduce the DEFINE_RES_NAMED_DESC() macro. While at it, provide DEFINE_RES(), which takes only start, size, and flags.

Link: https://lkml.kernel.org/r/20250317181412.1560630-1-andriy.shevchenko@linux.intel.com
Link: https://lkml.kernel.org/r/20250317181412.1560630-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko
Cc: Ilpo Järvinen
Signed-off-by: Andrew Morton
---
 include/linux/ioport.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 5385349f0b8a..e8b2d6aa4013 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -154,15 +154,20 @@ enum {
 };
 
 /* helpers to define resources */
-#define DEFINE_RES_NAMED(_start, _size, _name, _flags)	\
+#define DEFINE_RES_NAMED_DESC(_start, _size, _name, _flags, _desc)	\
 (struct resource) {						\
 		.start = (_start),				\
 		.end = (_start) + (_size) - 1,			\
 		.name = (_name),				\
 		.flags = (_flags),				\
-		.desc = IORES_DESC_NONE,			\
+		.desc = (_desc),				\
 	}
 
+#define DEFINE_RES_NAMED(_start, _size, _name, _flags)	\
+	DEFINE_RES_NAMED_DESC(_start, _size, _name, _flags, IORES_DESC_NONE)
+#define DEFINE_RES(_start, _size, _flags)		\
+	DEFINE_RES_NAMED(_start, _size, NULL, _flags)
+
 #define DEFINE_RES_IO_NAMED(_start, _size, _name)	\
 	DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_IO)
 #define DEFINE_RES_IO(_start, _size)			\
-- cgit v1.2.3


From 75845c6c1a64483e9985302793dbf0dfa5f71e32 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Wed, 19 Mar 2025 15:57:46 +0000
Subject: keys: Fix UAF in key_put()

Once a key's reference count has been reduced to 0, the garbage collector thread may destroy it at any time, and so key_put() is not allowed to touch the key after that point. The most key_put() is normally allowed to do is to touch key_gc_work, as that is a static global variable.

However, in an effort to speed up the reclamation of quota, this is now done in key_put() once the key's usage is reduced to 0 - but now the code is looking at the key after the deadline, which is forbidden.

Fix this by using a flag to indicate that a key can be gc'd now, rather than looking at the key's refcount in the garbage collector.
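A hedged sketch of the intended handoff (simplified; the real key_put() and
garbage collector changes live outside include/linux and are not shown in
this diff):

  /* Illustration only: mark the key for the gc instead of inspecting
   * it after the final reference is gone.
   */
  void example_key_put(struct key *key)
  {
  	if (refcount_dec_and_test(&key->usage)) {
  		set_bit(KEY_FLAG_FINAL_PUT, &key->flags);
  		schedule_work(&key_gc_work);	/* static global, safe to touch */
  	}
  }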
Fixes: 9578e327b2b4 ("keys: update key quotas in key_put()") Reported-by: syzbot+6105ffc1ded71d194d6d@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/673b6aec.050a0220.87769.004a.GAE@google.com/ Signed-off-by: David Howells Tested-by: syzbot+6105ffc1ded71d194d6d@syzkaller.appspotmail.com Reviewed-by: Oleg Nesterov Signed-off-by: Jarkko Sakkinen --- include/linux/key.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/key.h b/include/linux/key.h index 074dca3222b9..ba05de8579ec 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -236,6 +236,7 @@ struct key { #define KEY_FLAG_ROOT_CAN_INVAL 7 /* set if key can be invalidated by root without permission */ #define KEY_FLAG_KEEP 8 /* set if key should not be removed */ #define KEY_FLAG_UID_KEYRING 9 /* set if key is a user or user session keyring */ +#define KEY_FLAG_FINAL_PUT 10 /* set if final put has happened on key */ /* the key type and key description string * - the desc is used to match a key against search criteria -- cgit v1.2.3 From ca1914a32cdcad26c4b003df743fe4f9e4bb2877 Mon Sep 17 00:00:00 2001 From: Ihor Matushchak Date: Sun, 16 Mar 2025 08:15:51 +0100 Subject: net: phy: phy_interface_t: Fix RGMII_TXID code comment Fix copy-paste error in the code comment for Interface Mode definitions. The code refers to Internal TX delay, not Internal RX delay. It was likely copied from the line above this one. Signed-off-by: Ihor Matushchak Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20250316071551.9794-1-ihor.matushchak@foobox.net Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 60d3b8860ea2..bfdbdc538910 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -81,7 +81,7 @@ extern const int phy_basic_ports_array[3]; * @PHY_INTERFACE_MODE_RGMII: Reduced gigabit media-independent interface * @PHY_INTERFACE_MODE_RGMII_ID: RGMII with Internal RX+TX delay * @PHY_INTERFACE_MODE_RGMII_RXID: RGMII with Internal RX delay - * @PHY_INTERFACE_MODE_RGMII_TXID: RGMII with Internal RX delay + * @PHY_INTERFACE_MODE_RGMII_TXID: RGMII with Internal TX delay * @PHY_INTERFACE_MODE_RTBI: Reduced TBI * @PHY_INTERFACE_MODE_SMII: Serial MII * @PHY_INTERFACE_MODE_XGMII: 10 gigabit media-independent interface -- cgit v1.2.3 From 1937a0be28c01a13e18912602b8eff08d7db77cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 17 Mar 2025 08:53:13 +0000 Subject: tcp: move icsk_clean_acked to a better location As a followup of my presentation in Zagreb for netdev 0x19: icsk_clean_acked is only used by TCP when/if CONFIG_TLS_DEVICE is enabled from tcp_ack(). Rename it to tcp_clean_acked, move it to tcp_sock structure in the tcp_sock_read_rx for better cache locality in TCP fast path. Define this field only when CONFIG_TLS_DEVICE is enabled saving 8 bytes on configs not using it. Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Sabrina Dubroca Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250317085313.2023214-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 159b2c59eb62..1669d95bb0f9 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -244,6 +244,9 @@ struct tcp_sock { struct minmax rtt_min; /* OOO segments go in this rbtree. 
Socket lock must be held. */ struct rb_root out_of_order_queue; +#if defined(CONFIG_TLS_DEVICE) + void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); +#endif u32 snd_ssthresh; /* Slow start size threshold */ u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ __cacheline_group_end(tcp_sock_read_rx); -- cgit v1.2.3 From c353e8983e0dea5dbba7789033326e1ad34135b7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Mar 2025 19:22:38 +0100 Subject: net: introduce per netns packet chains Currently network taps unbound to any interface are linked in the global ptype_all list, affecting the performance in all the network namespaces. Add per netns ptypes chains, so that in the mentioned case only the netns owning the packet socket(s) is affected. While at that drop the global ptype_all list: no in kernel user registers a tap on "any" type without specifying either the target device or the target namespace (and IMHO doing that would not make any sense). Note that this adds a conditional in the fast path (to check for per netns ptype_specific list) and increases the dataset size by a cacheline (owing the per netns lists). Reviewed-by: Sabrina Dubroca Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/ae405f98875ee87f8150c460ad162de7e466f8a7.1742494826.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0c5b1f7f8f3a..f22cca7c03ad 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4278,7 +4278,17 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev, return 0; } -bool dev_nit_active(struct net_device *dev); +bool dev_nit_active_rcu(const struct net_device *dev); +static inline bool dev_nit_active(const struct net_device *dev) +{ + bool ret; + + rcu_read_lock(); + ret = dev_nit_active_rcu(dev); + rcu_read_unlock(); + return ret; +} + void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); static inline void __dev_put(struct net_device *dev) -- cgit v1.2.3 From 26f80681a09b95fd64bcf33d02e258e78a30842b Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Wed, 5 Mar 2025 15:03:54 +0100 Subject: sched: Add sched tracepoints for RV task model Add the following tracepoints: * sched_entry(bool preempt, ip) Called while entering __schedule * sched_exit(bool is_switch, ip) Called while exiting __schedule * sched_set_state(task, curr_state, state) Called when a task changes its state (to and from running) These tracepoints are useful to describe the Linux task model and are adapted from the patches by Daniel Bristot de Oliveira (https://bristot.me/linux-task-model/). 
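For illustration, a hedged sketch of a module probe on the new state
tracepoint (the probe signature is assumed from the description above, and
the counter is invented for the example):

  static DEFINE_PER_CPU(unsigned long, sleep_transitions);

  /* Hypothetical probe: count transitions out of TASK_RUNNING. */
  static void probe_set_state(void *data, struct task_struct *tsk,
  			    int curr_state, int state)
  {
  	if (curr_state == TASK_RUNNING && state != TASK_RUNNING)
  		this_cpu_inc(sleep_transitions);
  }
  /* registered e.g. via register_trace_sched_set_state_tp(probe_set_state, NULL) */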
Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Juri Lelli Link: https://lore.kernel.org/20250305140406.350227-2-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Acked-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (Google) --- include/linux/rv.h | 2 +- include/linux/sched.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rv.h b/include/linux/rv.h index 8883b41d88ec..55d458be53a4 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -7,7 +7,7 @@ #ifndef _LINUX_RV_H #define _LINUX_RV_H -#define MAX_DA_NAME_LEN 24 +#define MAX_DA_NAME_LEN 32 #ifdef CONFIG_RV /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 9632e3318e0d..45a61a0b9790 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -46,6 +46,7 @@ #include #include #include +#include #include /* task_struct member predeclarations (sorted alphabetically): */ @@ -186,6 +187,12 @@ struct user_event_mm; # define debug_rtlock_wait_restore_state() do { } while (0) #endif +#define trace_set_current_state(state_value) \ + do { \ + if (tracepoint_enabled(sched_set_state_tp)) \ + __trace_set_current_state(state_value); \ + } while (0) + /* * set_current_state() includes a barrier so that the write of current->__state * is correctly serialised wrt the caller's subsequent test of whether to @@ -226,12 +233,14 @@ struct user_event_mm; #define __set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ + trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ } while (0) #define set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ + trace_set_current_state(state_value); \ smp_store_mb(current->__state, (state_value)); \ } while (0) @@ -247,6 +256,7 @@ struct user_event_mm; \ raw_spin_lock_irqsave(¤t->pi_lock, flags); \ debug_special_state_change((state_value)); \ + trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ raw_spin_unlock_irqrestore(¤t->pi_lock, flags); \ } while (0) @@ -282,6 +292,7 @@ struct user_event_mm; raw_spin_lock(¤t->pi_lock); \ current->saved_state = current->__state; \ debug_rtlock_wait_set_state(); \ + trace_set_current_state(TASK_RTLOCK_WAIT); \ WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ raw_spin_unlock(¤t->pi_lock); \ } while (0); @@ -291,6 +302,7 @@ struct user_event_mm; lockdep_assert_irqs_disabled(); \ raw_spin_lock(¤t->pi_lock); \ debug_rtlock_wait_restore_state(); \ + trace_set_current_state(current->saved_state); \ WRITE_ONCE(current->__state, current->saved_state); \ current->saved_state = TASK_RUNNING; \ raw_spin_unlock(¤t->pi_lock); \ @@ -327,6 +339,10 @@ extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); extern void io_schedule(void); +/* wrapper function to trace from this header file */ +DECLARE_TRACEPOINT(sched_set_state_tp); +extern void __trace_set_current_state(int state_value); + /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode -- cgit v1.2.3 From cb85c660fcd4b3a03ed993affa0b2d1a8af2f06b Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Wed, 5 Mar 2025 15:03:55 +0100 Subject: rv: Add option for nested monitors and include sched Monitors describing complex systems, such as the scheduler, can easily grow to the point where they are just hard to understand because of the many possible state transitions. 
Often it is possible to break such descriptions into smaller monitors, sharing some or all events. Enabling those smaller monitors concurrently is, in fact, testing the system as if we had one single larger monitor. Splitting models into multiple specification is not only easier to understand, but gives some more clues when we see errors. Add the possibility to create container monitors, whose only purpose is to host other nested monitors. Enabling a container monitor enables all nested ones, but it's still possible to enable nested monitors independently. Add the sched monitor as first container, for now empty. Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Link: https://lore.kernel.org/20250305140406.350227-3-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- include/linux/rv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rv.h b/include/linux/rv.h index 55d458be53a4..3452b5e4b29e 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -56,7 +56,7 @@ struct rv_monitor { bool rv_monitoring_on(void); int rv_unregister_monitor(struct rv_monitor *monitor); -int rv_register_monitor(struct rv_monitor *monitor); +int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent); int rv_get_task_monitor_slot(void); void rv_put_task_monitor_slot(int slot); -- cgit v1.2.3 From 24fe172b50b5749c315349e740e4a09e3a0165d5 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 24 Mar 2025 14:56:01 -0700 Subject: objtool: Fix up some outdated references to ENTRY/ENDPROC ENTRY and ENDPROC were deprecated years ago and replaced with SYM_FUNC_{START,END}. Fix up a few outdated references in the objtool documentation and comments. Also fix a few typos. Suggested-by: Brendan Jackman Suggested-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/5eb7e06e1a0e87aaeda8d583ab060e7638a6ea8e.1742852846.git.jpoimboe@kernel.org --- include/linux/linkage.h | 4 ---- include/linux/objtool.h | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 5c8865bb59d9..b11660b706c5 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -134,10 +134,6 @@ .size name, .-name #endif -/* If symbol 'name' is treated as a subroutine (gets called, and returns) - * then please use ENDPROC to mark 'name' as STT_FUNC for the benefit of - * static analysis tools such as stack depth analyzer. - */ #ifndef ENDPROC /* deprecated, use SYM_FUNC_END */ #define ENDPROC(name) \ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 3ca965a2ddc8..366ad004d794 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -69,7 +69,7 @@ * In asm, there are two kinds of code: normal C-type callable functions and * the rest. The normal callable functions can be called by other code, and * don't do anything unusual with the stack. Such normal callable functions - * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this + * are annotated with SYM_FUNC_{START,END}. Most asm code falls in this * category. In this case, no special debugging annotations are needed because * objtool can automatically generate the ORC data for the ORC unwinder to read * at runtime. 
-- cgit v1.2.3


From 6aa63a4ec947f350d1a2f9f6aba8591a2455d192 Mon Sep 17 00:00:00 2001
From: Robin Murphy
Date: Mon, 24 Mar 2025 21:05:15 -0700
Subject: iommu: Sort out domain user data

When DMA/MSI cookies were made first-class citizens back in commit 46983fcd67ac ("iommu: Pull IOVA cookie management into the core"), there was no real need to further expose the two different cookie types. However, now that IOMMUFD wants to add a third type of MSI-mapping cookie, we do have a nicely compelling reason to properly disambiguate things at the domain level, beyond just vaguely guessing from the domain type.

Meanwhile, we also effectively have another "cookie" in the form of the anonymous union for other user data, which isn't much better in terms of being vague and unenforced.

The fact is that all these cookie types are mutually exclusive, in the sense that combining them makes zero sense and/or would be catastrophic (iommu_set_fault_handler() on an SVA domain, anyone?) - the only combination which *might* be reasonable is perhaps a fault handler and an MSI cookie, but nobody's doing that at the moment, so let's rule it out as well for the sake of being clear and robust. To that end, we pull DMA and MSI cookies apart a little more, mostly to clear up the ambiguity at domain teardown, then for clarity (and to save a little space), move them into the union, whose ownership we can then properly describe and enforce entirely unambiguously.

[nicolinc: rebase on latest tree; use prefix IOMMU_COOKIE_; merge unions in iommu_domain; add IOMMU_COOKIE_IOMMUFD for iommufd_hwpt]

Link: https://patch.msgid.link/r/1ace9076c95204bbe193ee77499d395f15f44b23.1742871535.git.nicolinc@nvidia.com
Signed-off-by: Robin Murphy
Reviewed-by: Kevin Tian
Signed-off-by: Nicolin Chen
Reviewed-by: Jason Gunthorpe
Signed-off-by: Jason Gunthorpe
---
 include/linux/iommu.h | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index e93d2e918599..06cc14e9993d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -41,6 +41,7 @@ struct iommu_dirty_ops;
 struct notifier_block;
 struct iommu_sva;
 struct iommu_dma_cookie;
+struct iommu_dma_msi_cookie;
 struct iommu_fault_param;
 struct iommufd_ctx;
 struct iommufd_viommu;
@@ -165,6 +166,15 @@ struct iommu_domain_geometry {
 	bool force_aperture;       /* DMA only allowed in mappable range?
*/ }; +enum iommu_domain_cookie_type { + IOMMU_COOKIE_NONE, + IOMMU_COOKIE_DMA_IOVA, + IOMMU_COOKIE_DMA_MSI, + IOMMU_COOKIE_FAULT_HANDLER, + IOMMU_COOKIE_SVA, + IOMMU_COOKIE_IOMMUFD, +}; + /* Domain feature flags */ #define __IOMMU_DOMAIN_PAGING (1U << 0) /* Support for iommu_map/unmap */ #define __IOMMU_DOMAIN_DMA_API (1U << 1) /* Domain for use in DMA-API @@ -211,12 +221,12 @@ struct iommu_domain_geometry { struct iommu_domain { unsigned type; + enum iommu_domain_cookie_type cookie_type; const struct iommu_domain_ops *ops; const struct iommu_dirty_ops *dirty_ops; const struct iommu_ops *owner; /* Whose domain_alloc we came from */ unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; - struct iommu_dma_cookie *iova_cookie; int (*iopf_handler)(struct iopf_group *group); #if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) @@ -224,10 +234,10 @@ struct iommu_domain { phys_addr_t msi_addr); #endif - union { /* Pointer usable by owner of the domain */ - struct iommufd_hw_pagetable *iommufd_hwpt; /* iommufd */ - }; - union { /* Fault handler */ + union { /* cookie */ + struct iommu_dma_cookie *iova_cookie; + struct iommu_dma_msi_cookie *msi_cookie; + struct iommufd_hw_pagetable *iommufd_hwpt; struct { iommu_fault_handler_t handler; void *handler_token; -- cgit v1.2.3 From 06d54f00f3f5a29cbf43410ac93ee2dd89e3b711 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 24 Mar 2025 21:05:17 -0700 Subject: iommu: Drop sw_msi from iommu_domain There are only two sw_msi implementations in the entire system, thus it's not very necessary to have an sw_msi pointer. Instead, check domain->cookie_type to call the two sw_msi implementations directly from the core code. Link: https://patch.msgid.link/r/7ded87c871afcbaac665b71354de0a335087bf0f.1742871535.git.nicolinc@nvidia.com Suggested-by: Robin Murphy Reviewed-by: Robin Murphy Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 06cc14e9993d..e01c855ae8a7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -229,11 +229,6 @@ struct iommu_domain { struct iommu_domain_geometry geometry; int (*iopf_handler)(struct iopf_group *group); -#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) - int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, - phys_addr_t msi_addr); -#endif - union { /* cookie */ struct iommu_dma_cookie *iova_cookie; struct iommu_dma_msi_cookie *msi_cookie; @@ -254,16 +249,6 @@ struct iommu_domain { }; }; -static inline void iommu_domain_set_sw_msi( - struct iommu_domain *domain, - int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, - phys_addr_t msi_addr)) -{ -#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) - domain->sw_msi = sw_msi; -#endif -} - static inline bool iommu_is_dma_domain(struct iommu_domain *domain) { return domain->type & __IOMMU_DOMAIN_DMA_API; -- cgit v1.2.3 From 2fb69c602d57f77483b8dcdd12d17408a09f76fe Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:33 -0700 Subject: iommufd: Support pasid attach/replace This extends the below APIs to support PASID. Device drivers to manage pasid attach/replace/detach. 
int iommufd_device_attach(struct iommufd_device *idev, ioasid_t pasid,
                          u32 *pt_id);
int iommufd_device_replace(struct iommufd_device *idev, ioasid_t pasid,
                           u32 *pt_id);
void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid);

The pasid operations share underlying attach/replace/detach infrastructure with the device operations, but still have some different implications:

- no reserved region per pasid, otherwise the SVA architecture is already broken (the CPU address space doesn't count device reserved regions);
- accordingly, no sw_msi trick;

Cache coherency enforcement is still applied to pasid operations, since it is about memory accesses after page table walking (no matter whether the walk is per RID or per PASID).

Link: https://patch.msgid.link/r/20250321171940.7213-12-yi.l.liu@intel.com
Reviewed-by: Jason Gunthorpe
Signed-off-by: Kevin Tian
Reviewed-by: Nicolin Chen
Signed-off-by: Yi Liu
Tested-by: Nicolin Chen
Signed-off-by: Jason Gunthorpe
---
 include/linux/iommufd.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 60eff9272551..34b6e6ca4bfa 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -8,6 +8,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -54,9 +55,11 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
 					   struct device *dev, u32 *id);
 void iommufd_device_unbind(struct iommufd_device *idev);
 
-int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id);
-int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id);
-void iommufd_device_detach(struct iommufd_device *idev);
+int iommufd_device_attach(struct iommufd_device *idev, ioasid_t pasid,
+			  u32 *pt_id);
+int iommufd_device_replace(struct iommufd_device *idev, ioasid_t pasid,
+			   u32 *pt_id);
+void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid);
 
 struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev);
 u32 iommufd_device_to_id(struct iommufd_device *idev);
-- cgit v1.2.3


From 7fe6b987166b901efc5c6fce5fe853c9ebb835be Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Fri, 21 Mar 2025 11:01:39 -0700
Subject: ida: Add ida_find_first_range()

There are no helpers for users to check whether a given ID is allocated, nor a helper to loop over all the allocated IDs in an IDA and do something for cleanup. To cover both needs, add a helper to get the lowest allocated ID of a range, plus two variants based on it.
Caller can check if a given ID is allocated or not by: bool ida_exists(struct ida *ida, unsigned int id) Caller can iterate all allocated IDs by: int id; while ((id = ida_find_first(&pasid_ida)) >= 0) { //anything to do with the allocated ID ida_free(pasid_ida, pasid); } Link: https://patch.msgid.link/r/20250321180143.8468-2-yi.l.liu@intel.com Cc: Matthew Wilcox (Oracle) Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: Matthew Wilcox (Oracle) Tested-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/linux/idr.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/idr.h b/include/linux/idr.h index da5f5fa4a3a6..718f9b1b91af 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -257,6 +257,7 @@ struct ida { int ida_alloc_range(struct ida *, unsigned int min, unsigned int max, gfp_t); void ida_free(struct ida *, unsigned int id); void ida_destroy(struct ida *ida); +int ida_find_first_range(struct ida *ida, unsigned int min, unsigned int max); /** * ida_alloc() - Allocate an unused ID. @@ -328,4 +329,14 @@ static inline bool ida_is_empty(const struct ida *ida) { return xa_empty(&ida->xa); } + +static inline bool ida_exists(struct ida *ida, unsigned int id) +{ + return ida_find_first_range(ida, id, id) == id; +} + +static inline int ida_find_first(struct ida *ida) +{ + return ida_find_first_range(ida, 0, ~0); +} #endif /* __IDR_H__ */ -- cgit v1.2.3 From 290641346d0d1eaf400c4f968d5b2cd91f483733 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 11:01:40 -0700 Subject: vfio-iommufd: Support pasid [at|de]tach for physical VFIO devices This adds pasid_at|de]tach_ioas ops for attaching hwpt to pasid of a device and the helpers for it. For now, only vfio-pci supports pasid attach/detach. Link: https://patch.msgid.link/r/20250321180143.8468-3-yi.l.liu@intel.com Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Alex Williamson Tested-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/linux/vfio.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 000a6cab2d31..707b00772ce1 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -67,6 +67,7 @@ struct vfio_device { struct inode *inode; #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_device *iommufd_device; + struct ida pasids; u8 iommufd_attached:1; #endif u8 cdev_opened:1; @@ -91,6 +92,8 @@ struct vfio_device { * bound iommufd. Undo in unbind_iommufd if @detach_ioas is not * called. 
* @detach_ioas: Opposite of attach_ioas + * @pasid_attach_ioas: The pasid variation of attach_ioas + * @pasid_detach_ioas: Opposite of pasid_attach_ioas * @open_device: Called when the first file descriptor is opened for this device * @close_device: Opposite of open_device * @read: Perform read(2) on device file descriptor @@ -115,6 +118,9 @@ struct vfio_device_ops { void (*unbind_iommufd)(struct vfio_device *vdev); int (*attach_ioas)(struct vfio_device *vdev, u32 *pt_id); void (*detach_ioas)(struct vfio_device *vdev); + int (*pasid_attach_ioas)(struct vfio_device *vdev, u32 pasid, + u32 *pt_id); + void (*pasid_detach_ioas)(struct vfio_device *vdev, u32 pasid); int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -139,6 +145,10 @@ int vfio_iommufd_physical_bind(struct vfio_device *vdev, void vfio_iommufd_physical_unbind(struct vfio_device *vdev); int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev); +int vfio_iommufd_physical_pasid_attach_ioas(struct vfio_device *vdev, + u32 pasid, u32 *pt_id); +void vfio_iommufd_physical_pasid_detach_ioas(struct vfio_device *vdev, + u32 pasid); int vfio_iommufd_emulated_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx, u32 *out_device_id); void vfio_iommufd_emulated_unbind(struct vfio_device *vdev); @@ -166,6 +176,10 @@ vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx) ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) #define vfio_iommufd_physical_detach_ioas \ ((void (*)(struct vfio_device *vdev)) NULL) +#define vfio_iommufd_physical_pasid_attach_ioas \ + ((int (*)(struct vfio_device *vdev, u32 pasid, u32 *pt_id)) NULL) +#define vfio_iommufd_physical_pasid_detach_ioas \ + ((void (*)(struct vfio_device *vdev, u32 pasid)) NULL) #define vfio_iommufd_emulated_bind \ ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ u32 *out_device_id)) NULL) -- cgit v1.2.3 From 367f1854d442b33c4a0305b068ae40d67ccd7d6a Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 20 Mar 2025 22:11:07 +0000 Subject: net: phylink: add phylink_prepare_resume() When the system is suspended, the PHY may be placed in low-power mode by setting the BMCR 0.11 Power down bit. IEEE 802.3 states that the behaviour of the PHY in this state is implementation specific, and the PHY is not required to meet the RX_CLK and TX_CLK requirements. Essentially, this means that a PHY may stop the clocks that it is generating while in power down state. However, MACs exist which require the clocks from the PHY to be running in order to properly resume. phylink_prepare_resume() provides them with a way to clear the Power down bit early. Note, however, that IEEE 802.3 gives PHYs up to 500ms grace before the transmit and receive clocks meet the requirements after clearing the power down bit. Add a resume preparation function, which will ensure that the receive clock from the PHY is appropriately configured while resuming. 
Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tvO6V-008Vjb-AP@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 79876c84ae81..06f1b649f173 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -707,6 +707,7 @@ void phylink_start(struct phylink *); void phylink_stop(struct phylink *); void phylink_suspend(struct phylink *pl, bool mac_wol); +void phylink_prepare_resume(struct phylink *pl); void phylink_resume(struct phylink *pl); void phylink_ethtool_get_wol(struct phylink *, struct ethtool_wolinfo *); -- cgit v1.2.3 From ddf4bd3f738485c84edb98ff96a5759904498e70 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 20 Mar 2025 22:11:22 +0000 Subject: net: phylink: add functions to block/unblock rx clock stop Some MACs require the PHY receive clock to be running to complete setup actions. This may fail if the PHY has negotiated EEE, the MAC supports receive clock stop, and the link has entered LPI state. Provide a pair of APIs that MAC drivers can use to temporarily block the PHY disabling the receive clock. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tvO6k-008Vjt-MZ@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 06f1b649f173..1f5773ab5660 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -706,6 +706,9 @@ int phylink_pcs_pre_init(struct phylink *pl, struct phylink_pcs *pcs); void phylink_start(struct phylink *); void phylink_stop(struct phylink *); +void phylink_rx_clk_stop_block(struct phylink *); +void phylink_rx_clk_stop_unblock(struct phylink *); + void phylink_suspend(struct phylink *pl, bool mac_wol); void phylink_prepare_resume(struct phylink *pl); void phylink_resume(struct phylink *pl); -- cgit v1.2.3 From 1f6154227b49c3d3f306f624858e695bfee50aae Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 25 Mar 2025 09:14:27 -0700 Subject: Revert "udp_tunnel: GRO optimizations" Revert "udp_tunnel: use static call for GRO hooks when possible" This reverts commit 311b36574ceaccfa3f91b74054a09cd4bb877702. Revert "udp_tunnel: create a fastpath GRO lookup." This reverts commit 8d4880db378350f8ed8969feea13bdc164564fc1. There are multiple small issues with the series. In the interest of unblocking the merge window let's opt for a revert. Link: https://lore.kernel.org/cover.1742557254.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/udp.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 895240177f4f..0807e21cfec9 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -101,13 +101,6 @@ struct udp_sock { /* Cache friendly copy of sk->sk_peek_off >= 0 */ bool peeking_with_offset; - - /* - * Accounting for the tunnel GRO fastpath. - * Unprotected by compilers guard, as it uses space available in - * the last UDP socket cacheline. 
- */ - struct hlist_node tunnel_list; }; #define udp_test_bit(nr, sk) \ @@ -226,13 +219,4 @@ static inline void udp_allow_gso(struct sock *sk) #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) -static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6) -{ -#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) - return rcu_dereference(net->ipv4.udp_tunnel_gro[is_ipv6].sk); -#else - return NULL; -#endif -} - #endif /* _LINUX_UDP_H */ -- cgit v1.2.3 From 983e0e4e87bdf465e8424b1902e41bfe51ba128a Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Tue, 18 Mar 2025 21:06:42 +0200 Subject: net-timestamp: COMPLETION timestamp on packet tx completion Add SOF_TIMESTAMPING_TX_COMPLETION, for requesting a software timestamp when hardware reports a packet completed. Completion tstamp is useful for Bluetooth, as hardware timestamps do not exist in the HCI specification except for ISO packets, and the hardware has a queue where packets may wait. In this case the software SND timestamp only reflects the kernel-side part of the total latency (usually small) and queue length (usually 0 unless HW buffers congested), whereas the completion report time is more informative of the true latency. It may also be useful in other cases where HW TX timestamps cannot be obtained and user wants to estimate an upper bound to when the TX probably happened. Signed-off-by: Pauli Virtanen Reviewed-by: Willem de Bruijn Signed-off-by: Luiz Augusto von Dentz --- include/linux/skbuff.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cd8294cdc249..b974a277975a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -478,8 +478,8 @@ enum { /* device driver is going to provide hardware time stamp */ SKBTX_IN_PROGRESS = 1 << 2, - /* reserved */ - SKBTX_RESERVED = 1 << 3, + /* generate software time stamp on packet tx completion */ + SKBTX_COMPLETION_TSTAMP = 1 << 3, /* generate wifi status information (where possible) */ SKBTX_WIFI_STATUS = 1 << 4, @@ -498,7 +498,8 @@ enum { #define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ SKBTX_SCHED_TSTAMP | \ - SKBTX_BPF) + SKBTX_BPF | \ + SKBTX_COMPLETION_TSTAMP) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \ SKBTX_ANY_SW_TSTAMP) -- cgit v1.2.3 From bae2da826196ff4ab439b57683dce883e274faef Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 24 Mar 2025 15:45:28 -0700 Subject: net: remove netif_set_real_num_rx_queues() helper for when SYSFS=n Since commit a953be53ce40 ("net-sysfs: add support for device-specific rx queue sysfs attributes"), so for at least a decade now it is safe to call net_rx_queue_update_kobjects() when SYSFS=n. That function does its own ifdef-inery and will return 0. Remove the unnecessary stub for netif_set_real_num_rx_queues(). 
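For illustration only (hypothetical driver), the point is that callers no
longer need to special-case SYSFS=n:

  static int my_nic_set_channels(struct net_device *dev, unsigned int n)
  {
  	int err;

  	/* Behaves the same with or without CONFIG_SYSFS. */
  	err = netif_set_real_num_rx_queues(dev, n);
  	if (err)
  		return err;
  	return netif_set_real_num_tx_queues(dev, n);
  }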
Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250324224537.248800-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f22cca7c03ad..55859c565f84 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4062,17 +4062,7 @@ static inline bool netif_is_multiqueue(const struct net_device *dev) } int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq); - -#ifdef CONFIG_SYSFS int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq); -#else -static inline int netif_set_real_num_rx_queues(struct net_device *dev, - unsigned int rxqs) -{ - dev->real_num_rx_queues = rxqs; - return 0; -} -#endif int netif_set_real_num_queues(struct net_device *dev, unsigned int txq, unsigned int rxq); -- cgit v1.2.3 From 4b702f8b72c7b05daa1b763fdc0840aa78178c3a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 24 Mar 2025 15:45:30 -0700 Subject: net: explain "protection types" for the instance lock Try to define some terminology for which fields are protected by which lock and how. Some fields are protected by both rtnl_lock and instance lock which is hard to talk about without having a "key phrase" to refer to a particular protection scheme. "ops protected" fields are defined later in the series, one by one. Add ASSERT_RTNL() to netdev_ops_assert_locked() for drivers not other instance protection of ops. Hopefully it's not too confusion that netdev_lock_ops() does not match the lock which netdev_ops_assert_locked() will assert, exactly. The noun "ops" is in a different place in the name, so I think it's acceptable... Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250324224537.248800-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 55859c565f84..2b91fb96a411 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2496,19 +2496,35 @@ struct net_device { * Should always be taken using netdev_lock() / netdev_unlock() helpers. * Drivers are free to use it for other protection. * - * Protects: + * For the drivers that implement shaper or queue API, the scope + * of this lock is expanded to cover most ndo/queue/ethtool/sysfs + * operations. Drivers may opt-in to this behavior by setting + * @request_ops_lock. + * + * @lock protection mixes with rtnl_lock in multiple ways, fields are + * either: + * + * - simply protected by the instance @lock; + * + * - double protected - writers hold both locks, readers hold either; + * + * - ops protected - protected by the lock held around the NDOs + * and other callbacks, that is the instance lock on devices for + * which netdev_need_ops_lock() returns true, otherwise by rtnl_lock; + * + * - double ops protected - always protected by rtnl_lock but for + * devices for which netdev_need_ops_lock() returns true - also + * the instance lock. + * + * Simply protects: * @gro_flush_timeout, @napi_defer_hard_irqs, @napi_list, * @net_shaper_hierarchy, @reg_state, @threaded * - * Partially protects (writers must hold both @lock and rtnl_lock): + * Double protects: * @up * * Also protects some fields in struct napi_struct. 
* - * For the drivers that implement shaper or queue API, the scope - * of this lock is expanded to cover most ndo/queue/ethtool/sysfs - * operations. - * * Ordering: take after rtnl_lock. */ struct mutex lock; -- cgit v1.2.3 From 0a65dcf6249b75c841b4218426b0d246a805c7e0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 24 Mar 2025 15:45:31 -0700 Subject: net: designate queue counts as "double ops protected" by instance lock Drivers which opt into instance lock protection of ops should only call set_real_num_*_queues() under the instance lock. This means that queue counts are double protected (writes are under both rtnl_lock and instance lock, readers under either). Some readers may still be under the rtnl_lock, however, so for now we need double protection of writers. OTOH queue API paths are only under the protection of the instance lock, so we need to validate that the instance is actually locking ops, otherwise the input checks we do against queue count are racy. Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250324224537.248800-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2b91fb96a411..60ef367d8575 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2523,6 +2523,9 @@ struct net_device { * Double protects: * @up * + * Double ops protects: + * @real_num_rx_queues, @real_num_tx_queues + * * Also protects some fields in struct napi_struct. * * Ordering: take after rtnl_lock. -- cgit v1.2.3 From 310ae9eb2617c62deedef8f121d7ca1ae774fa76 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 24 Mar 2025 15:45:32 -0700 Subject: net: designate queue -> napi linking as "ops protected" netdev netlink is the only reader of netdev_{,rx_}queue->napi, and it already holds netdev->lock. Switch protection of the writes to netdev->lock to "ops protected". The expectation will be now that accessing queue->napi will require netdev->lock for "ops locked" drivers, and rtnl_lock for all other drivers. Current "ops locked" drivers don't require any changes. gve and netdevsim use _locked() helpers right next to netif_queue_set_napi() so they must be holding the instance lock. iavf doesn't call it. bnxt is a bit messy but all paths seem locked. Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250324224537.248800-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 60ef367d8575..fa79145518d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -710,7 +710,7 @@ struct netdev_queue { * slow- / control-path part */ /* NAPI instance for the queue - * Readers and writers must hold RTNL + * "ops protected", see comment about net_device::lock */ struct napi_struct *napi; @@ -2526,7 +2526,8 @@ struct net_device { * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues * - * Also protects some fields in struct napi_struct. + * Also protects some fields in: + * struct napi_struct, struct netdev_queue, struct netdev_rx_queue * * Ordering: take after rtnl_lock. 
*/ -- cgit v1.2.3 From 9e6ec8cf64e2973f0ec74f09023988cabd218426 Mon Sep 17 00:00:00 2001 From: xueqin Luo Date: Thu, 6 Feb 2025 16:14:36 +0800 Subject: thermal: core: Remove duplicate struct declaration The struct thermal_zone_device is already declared on line 32, so the duplicate declaration has been removed. Fixes: b1ae92dcfa8e ("thermal: core: Make struct thermal_zone_device definition internal") Signed-off-by: xueqin Luo Link: https://lore.kernel.org/r/20250206081436.51785-1-luoxueqin@kylinos.cn Signed-off-by: Daniel Lezcano --- include/linux/thermal.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 69f9bedd0ee8..0b5ed6821080 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -86,8 +86,6 @@ struct thermal_trip { #define THERMAL_TRIP_PRIV_TO_INT(_val_) (uintptr_t)(_val_) #define THERMAL_INT_TO_TRIP_PRIV(_val_) (void *)(uintptr_t)(_val_) -struct thermal_zone_device; - struct cooling_spec { unsigned long upper; /* Highest cooling state */ unsigned long lower; /* Lowest cooling state */ -- cgit v1.2.3 From dc84bc2aba85a1508f04a936f9f9a15f64ebfb31 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Mar 2025 12:23:23 +0100 Subject: x86/mm/pat: Fix VM_PAT handling when fork() fails in copy_page_range() If track_pfn_copy() fails, we already added the dst VMA to the maple tree. As fork() fails, we'll cleanup the maple tree, and stumble over the dst VMA for which we neither performed any reservation nor copied any page tables. Consequently untrack_pfn() will see VM_PAT and try obtaining the PAT information from the page table -- which fails because the page table was not copied. The easiest fix would be to simply clear the VM_PAT flag of the dst VMA if track_pfn_copy() fails. However, the whole thing is about "simply" clearing the VM_PAT flag is shaky as well: if we passed track_pfn_copy() and performed a reservation, but copying the page tables fails, we'll simply clear the VM_PAT flag, not properly undoing the reservation ... which is also wrong. So let's fix it properly: set the VM_PAT flag only if the reservation succeeded (leaving it clear initially), and undo the reservation if anything goes wrong while copying the page tables: clearing the VM_PAT flag after undoing the reservation. Note that any copied page table entries will get zapped when the VMA will get removed later, after copy_page_range() succeeded; as VM_PAT is not set then, we won't try cleaning VM_PAT up once more and untrack_pfn() will be happy. Note that leaving these page tables in place without a reservation is not a problem, as we are aborting fork(); this process will never run. A reproducer can trigger this usually at the first try: https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/reproducers/pat_fork.c WARNING: CPU: 26 PID: 11650 at arch/x86/mm/pat/memtype.c:983 get_pat_info+0xf6/0x110 Modules linked in: ... CPU: 26 UID: 0 PID: 11650 Comm: repro3 Not tainted 6.12.0-rc5+ #92 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014 RIP: 0010:get_pat_info+0xf6/0x110 ... Call Trace: ... untrack_pfn+0x52/0x110 unmap_single_vma+0xa6/0xe0 unmap_vmas+0x105/0x1f0 exit_mmap+0xf6/0x460 __mmput+0x4b/0x120 copy_process+0x1bf6/0x2aa0 kernel_clone+0xab/0x440 __do_sys_clone+0x66/0x90 do_syscall_64+0x95/0x180 Likely this case was missed in: d155df53f310 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed") ... and instead of undoing the reservation we simply cleared the VM_PAT flag. 
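To make the intended flow concrete, here is a hedged sketch of the copy
side (simplified; the real call sites are in mm/memory.c and are not part
of this diff, and copy_pgtables() is a stand-in name):

  /* Simplified fork path for a VM_PFNMAP VMA: */
  unsigned long pfn;
  int err;

  err = track_pfn_copy(dst_vma, src_vma, &pfn);
  if (err)
  	return err;			/* dst VMA keeps VM_PAT clear */

  if (copy_pgtables(dst_vma, src_vma)) {
  	/* Undo the reservation and clear VM_PAT on the dst VMA. */
  	untrack_pfn_copy(dst_vma, pfn);
  	return -ENOMEM;
  }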
Keep the documentation of these functions in include/linux/pgtable.h, one place is more than sufficient -- we should clean that up for the other functions like track_pfn_remap/untrack_pfn separately. Fixes: d155df53f310 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed") Fixes: 2ab640379a0a ("x86: PAT: hooks in generic vm code to help archs to track pfnmap regions - v3") Reported-by: xingwei lee Reported-by: yuxin wang Reported-by: Marius Fleischer Signed-off-by: David Hildenbrand Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Rik van Riel Cc: "H. Peter Anvin" Cc: Linus Torvalds Cc: Andrew Morton Cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/20250321112323.153741-1-david@redhat.com Closes: https://lore.kernel.org/lkml/CABOYnLx_dnqzpCW99G81DmOr+2UzdmZMk=T3uxwNxwz+R1RAwg@mail.gmail.com/ Closes: https://lore.kernel.org/lkml/CAJg=8jwijTP5fre8woS4JVJQ8iUA6v+iNcsOgtj9Zfpc3obDOQ@mail.gmail.com/ --- include/linux/pgtable.h | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 94d267d02372..4c107e17c547 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1508,14 +1508,25 @@ static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, } /* - * track_pfn_copy is called when vma that is covering the pfnmap gets - * copied through copy_page_range(). + * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page + * tables copied during copy_page_range(). On success, stores the pfn to be + * passed to untrack_pfn_copy(). */ -static inline int track_pfn_copy(struct vm_area_struct *vma) +static inline int track_pfn_copy(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, unsigned long *pfn) { return 0; } +/* + * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during + * copy_page_range(), but after track_pfn_copy() was already called. + */ +static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma, + unsigned long pfn) +{ +} + /* * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or @@ -1528,8 +1539,10 @@ static inline void untrack_pfn(struct vm_area_struct *vma, } /* - * untrack_pfn_clear is called while mremapping a pfnmap for a new region - * or fails to copy pgtable during duplicate vm area. + * untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA: + * + * 1) During mremap() on the src VMA after the page tables were moved. + * 2) During fork() on the dst VMA, immediately after duplicating the src VMA. 
*/ static inline void untrack_pfn_clear(struct vm_area_struct *vma) { @@ -1540,7 +1553,10 @@ extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long size); extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn); -extern int track_pfn_copy(struct vm_area_struct *vma); +extern int track_pfn_copy(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, unsigned long *pfn); +extern void untrack_pfn_copy(struct vm_area_struct *dst_vma, + unsigned long pfn); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size, bool mm_wr_locked); extern void untrack_pfn_clear(struct vm_area_struct *vma); -- cgit v1.2.3 From 2c5ac026fd1421cf6a78770b48570b2563ef40b7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 24 Mar 2025 16:39:29 +0200 Subject: =?UTF-8?q?net:=20phy:=20Introduce=20PHY=5FID=5FSIZE=20=E2=80=94?= =?UTF-8?q?=20minimum=20size=20for=20PHY=20ID=20string?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PHY_ID_FMT defines the format specifier "%s:%02x" to form the PHY ID string, where the maximum of the first part is defined in MII_BUS_ID_SIZE, including NUL terminator, and the second part is implied to be 3 as the maximum address is limited to 32, meaning that 2 hex digits is more than enough, plus ':' (colon) delimiter. However, some drivers, which are using PHY_ID_FMT, customise buffer size and do that incorrectly. Introduce a new constant PHY_ID_SIZE that makes the minimum required size explicit, so drivers are encouraged to use it. Suggested-by: "Russell King (Oracle)" Signed-off-by: Andy Shevchenko Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20250324144751.1271761-2-andriy.shevchenko@linux.intel.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index bfdbdc538910..a2bfae80c449 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -292,6 +292,7 @@ static inline long rgmii_clock(int speed) /* Used when trying to connect to a specific phy (mii bus id:phy device id) */ #define PHY_ID_FMT "%s:%02x" +#define PHY_ID_SIZE (MII_BUS_ID_SIZE + 3) #define MII_BUS_ID_SIZE 61 -- cgit v1.2.3 From 4b313c69a38e28b2f002198c3909fb553e9b0176 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 10 Mar 2025 12:10:21 +0100 Subject: PCI: endpoint: Add intx_capable to epc_features struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In struct pci_epc_features, an EPC driver can already specify if they support MSI (by setting msi_capable) and MSI-X (by setting msix_capable). Thus, for consistency, allow an EPC driver to specify if it supports INTx interrupts as well (by setting intx_capable). Since this struct is zero initialized, EPC drivers that want to claim INTx support will need to set intx_capable to true. 
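As an illustration, a minimal sketch of an EPC driver opting in (the
driver name and the align value are hypothetical, not taken from the
patch; the capability bits are the ones shown in the struct below):

	#include <linux/pci-epc.h>
	#include <linux/sizes.h>

	static const struct pci_epc_features foo_epc_features = {
		.linkup_notifier = true,
		.msi_capable	 = true,
		.msix_capable	 = false,
		/* Zero-initialized, so INTx support must be claimed explicitly. */
		.intx_capable	 = true,
		.align		 = SZ_64K,
	};

An EPF driver can then consult these features (e.g. via
pci_epc_get_features()) before deciding which interrupt type to raise.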
Signed-off-by: Niklas Cassel [kwilczynski: add missing kernel-doc for "intx_capable"] Signed-off-by: Krzysztof Wilczyński Link: https://lore.kernel.org/r/20250310111016.859445-13-cassel@kernel.org --- include/linux/pci-epc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index e818e3fdcded..9d14d78abd14 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -221,6 +221,7 @@ struct pci_epc_bar_desc { * @linkup_notifier: indicate if the EPC device can notify EPF driver on link up * @msi_capable: indicate if the endpoint function has MSI capability * @msix_capable: indicate if the endpoint function has MSI-X capability + * @intx_capable: indicate if the endpoint can raise INTx interrupts * @bar: array specifying the hardware description for each BAR * @align: alignment size required for BAR buffer allocation */ @@ -228,6 +229,7 @@ struct pci_epc_features { unsigned int linkup_notifier : 1; unsigned int msi_capable : 1; unsigned int msix_capable : 1; + unsigned int intx_capable : 1; struct pci_epc_bar_desc bar[PCI_STD_NUM_BARS]; size_t align; }; -- cgit v1.2.3 From 73b6dacb1c6feae8ca4a6ff120848430aeb57fbd Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 25 Mar 2025 08:39:42 -0600 Subject: io_uring/net: use REQ_F_IMPORT_BUFFER for send_zc Instead of a bool field in struct io_sr_msg, use REQ_F_IMPORT_BUFFER to track whether io_send_zc() has already imported the buffer. This flag already serves a similar purpose for sendmsg_zc and {read,write}v_fixed. Signed-off-by: Caleb Sander Mateos Suggested-by: Pavel Begunkov Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20250325143943.1226467-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index c17d2eedf478..699e2c0895ae 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -585,7 +585,10 @@ enum { REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT), /* request has read/write metadata assigned */ REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT), - /* resolve padded iovec to registered buffers */ + /* + * For vectored fixed buffers, resolve iovec to registered buffers. + * For SEND_ZC, whether to import buffers (i.e. the first issue). + */ REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT), }; -- cgit v1.2.3 From 9b08a16637eeef4d6d3a8a3b69714e8930676248 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 20 Mar 2025 20:06:50 +0100 Subject: lsm: Add audit_log_lsm_data() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract code from dump_common_audit_data() into the audit_log_lsm_data() helper. This helps reuse common LSM audit data while not abusing AUDIT_AVC records because of the common_lsm_audit() helper. Depends-on: 7ccbe076d987 ("lsm: Only build lsm_audit.c if CONFIG_SECURITY and CONFIG_AUDIT are set") Cc: Casey Schaufler Cc: James Morris Cc: Serge E. 
Hallyn
Acked-by: Paul Moore
Link: https://lore.kernel.org/r/20250320190717.2287696-2-mic@digikod.net
Reviewed-by: Günther Noack
Signed-off-by: Mickaël Salaün
---
 include/linux/lsm_audit.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
index e13d2f947b51..bddd694f7c4c 100644
--- a/include/linux/lsm_audit.h
+++ b/include/linux/lsm_audit.h
@@ -132,6 +132,9 @@ void common_lsm_audit(struct common_audit_data *a,
 		      void (*pre_audit)(struct audit_buffer *, void *),
 		      void (*post_audit)(struct audit_buffer *, void *));
 
+void audit_log_lsm_data(struct audit_buffer *ab,
+			const struct common_audit_data *a);
+
 #else /* CONFIG_AUDIT */
 
 static inline void common_lsm_audit(struct common_audit_data *a,
@@ -140,6 +143,11 @@ static inline void common_lsm_audit(struct common_audit_data *a,
 {
 }
 
+static inline void audit_log_lsm_data(struct audit_buffer *ab,
+				      const struct common_audit_data *a)
+{
+}
+
 #endif /* CONFIG_AUDIT */
 
 #endif
-- cgit v1.2.3 

From 98bbabbc12dce03da8473edd0bc8d7072d723769 Mon Sep 17 00:00:00 2001
From: Chun-Kuang Hu
Date: Sun, 9 Feb 2025 02:11:02 +0000
Subject: mailbox: mtk-cmdq: remove cl in struct cmdq_pkt

Every client driver has the struct cmdq_client information, so it's not
necessary to store it in struct cmdq_pkt. cl is used to store struct
cmdq_client information, and now that no client driver uses it, remove
it.

Signed-off-by: Chun-Kuang Hu
Signed-off-by: Jassi Brar
---
 include/linux/mailbox/mtk-cmdq-mailbox.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mailbox/mtk-cmdq-mailbox.h b/include/linux/mailbox/mtk-cmdq-mailbox.h
index a8f0070c7aa9..4c1a91b07de3 100644
--- a/include/linux/mailbox/mtk-cmdq-mailbox.h
+++ b/include/linux/mailbox/mtk-cmdq-mailbox.h
@@ -75,7 +75,6 @@ struct cmdq_pkt {
 	dma_addr_t		pa_base;
 	size_t			cmd_buf_size; /* command occupied size */
 	size_t			buf_size; /* real buffer size */
-	void			*cl;
 };
 
 u8 cmdq_get_shift_pa(struct mbox_chan *chan);
-- cgit v1.2.3 

From db824c1119fc16556a84cb7a771ca6553b3c3a45 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus
Date: Mon, 24 Feb 2025 08:27:15 +0000
Subject: mailbox: sort headers alphabetically

Sorting headers alphabetically helps locate duplicates and makes it
easier to figure out where to insert new headers.
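As a trivial sketch of the rationale (the headers chosen here are
arbitrary), alphabetical ordering makes a duplicate or misplaced entry
immediately visible:

	#include <linux/device.h>
	#include <linux/of.h>
	#include <linux/of.h>		/* duplicate is adjacent, easy to spot */
	#include <linux/platform_device.h>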
Signed-off-by: Tudor Ambarus Signed-off-by: Jassi Brar --- include/linux/mailbox_client.h | 2 +- include/linux/mailbox_controller.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mailbox_client.h b/include/linux/mailbox_client.h index 734694912ef7..c6eea9afb943 100644 --- a/include/linux/mailbox_client.h +++ b/include/linux/mailbox_client.h @@ -7,8 +7,8 @@ #ifndef __MAILBOX_CLIENT_H #define __MAILBOX_CLIENT_H -#include #include +#include struct mbox_chan; diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index 6fee33cb52f5..5fb0b65f45a2 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -3,11 +3,11 @@ #ifndef __MAILBOX_CONTROLLER_H #define __MAILBOX_CONTROLLER_H +#include +#include +#include #include #include -#include -#include -#include struct mbox_chan; -- cgit v1.2.3 From 6889ae1b4df1579bcdffef023e2ea9a982565dff Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Mar 2025 09:57:27 +0000 Subject: io_uring/net: fix io_req_post_cqe abuse by send bundle [ 114.987980][ T5313] WARNING: CPU: 6 PID: 5313 at io_uring/io_uring.c:872 io_req_post_cqe+0x12e/0x4f0 [ 114.991597][ T5313] RIP: 0010:io_req_post_cqe+0x12e/0x4f0 [ 115.001880][ T5313] Call Trace: [ 115.002222][ T5313] [ 115.007813][ T5313] io_send+0x4fe/0x10f0 [ 115.009317][ T5313] io_issue_sqe+0x1a6/0x1740 [ 115.012094][ T5313] io_wq_submit_work+0x38b/0xed0 [ 115.013223][ T5313] io_worker_handle_work+0x62a/0x1600 [ 115.013876][ T5313] io_wq_worker+0x34f/0xdf0 As the comment states, io_req_post_cqe() should only be used by multishot requests, i.e. REQ_F_APOLL_MULTISHOT, which bundled sends are not. Add a flag signifying whether a request wants to post multiple CQEs. Eventually REQ_F_APOLL_MULTISHOT should imply the new flag, but that's left out for simplicity. Cc: stable@vger.kernel.org Fixes: a05d1f625c7aa ("io_uring/net: support bundles for send") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8b611dbb54d1cd47a88681f5d38c84d0c02bc563.1743067183.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 699e2c0895ae..b44d201520d8 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -490,6 +490,7 @@ enum { REQ_F_SKIP_LINK_CQES_BIT, REQ_F_SINGLE_POLL_BIT, REQ_F_DOUBLE_POLL_BIT, + REQ_F_MULTISHOT_BIT, REQ_F_APOLL_MULTISHOT_BIT, REQ_F_CLEAR_POLLIN_BIT, /* keep async read/write and isreg together and in order */ @@ -567,6 +568,8 @@ enum { REQ_F_SINGLE_POLL = IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT), /* double poll may active */ REQ_F_DOUBLE_POLL = IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT), + /* request posts multiple completions, should be set at prep time */ + REQ_F_MULTISHOT = IO_REQ_FLAG(REQ_F_MULTISHOT_BIT), /* fast poll multishot mode */ REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT), /* recvmsg special flag, clear EPOLLIN */ -- cgit v1.2.3 From de9e33df7762abbfc2a1568291f2c3a3154c6a9d Mon Sep 17 00:00:00 2001 From: Jonathan McDowell Date: Wed, 12 Mar 2025 07:26:18 +0200 Subject: tpm, tpm_tis: Workaround failed command reception on Infineon devices Some Infineon devices have a issue where the status register will get stuck with a quick REQUEST_USE / COMMAND_READY sequence. 
This is not simply a matter of requiring a longer timeout; the work around is to retry the command submission. Add appropriate logic to do this in the send path. This is fixed in later firmware revisions, but those are not always available, and cannot generally be easily updated from outside a firmware environment. Testing has been performed with a simple repeated loop of doing a TPM2_CC_GET_CAPABILITY for TPM_CAP_PROP_MANUFACTURER using the Go code at: https://the.earth.li/~noodles/tpm-stuff/timeout-reproducer-simple.go It can take several hours to reproduce, and several million operations. Signed-off-by: Jonathan McDowell Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- include/linux/tpm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 20a40ade8030..6c3125300c00 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -335,6 +335,7 @@ enum tpm2_cc_attrs { #define TPM_VID_WINBOND 0x1050 #define TPM_VID_STM 0x104A #define TPM_VID_ATML 0x1114 +#define TPM_VID_IFX 0x15D1 enum tpm_chip_flags { TPM_CHIP_FLAG_BOOTSTRAPPED = BIT(0), -- cgit v1.2.3 From 923936efeb74b3f42e5ad283a0b9110bda102601 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 28 Mar 2025 01:01:19 +0800 Subject: iomap: Fix conflicting values of iomap flags IOMAP_F_ATOMIC_BIO mistakenly took the same value as of IOMAP_F_SIZE_CHANGED in patch '370a6de7651b ("iomap: rework IOMAP atomic flags")'. Let's fix this and let's also create some more space for filesystem reported flags to avoid this in future. This patch makes the core iomap flags to start from bit 15, moving downwards. Note that "flags" member within struct iomap is of type u16. Fixes: 370a6de7651b ("iomap: rework IOMAP atomic flags") Signed-off-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20250327170119.61045-1-ritesh.list@gmail.com Reviewed-by: John Garry Signed-off-by: Christian Brauner --- include/linux/iomap.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 02fe001feebb..68416b135151 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -78,6 +78,11 @@ struct vm_fault; #define IOMAP_F_ANON_WRITE (1U << 7) #define IOMAP_F_ATOMIC_BIO (1U << 8) +/* + * Flag reserved for file system specific usage + */ +#define IOMAP_F_PRIVATE (1U << 12) + /* * Flags set by the core iomap code during operations: * @@ -88,14 +93,8 @@ struct vm_fault; * range it covers needs to be remapped by the high level before the operation * can proceed. */ -#define IOMAP_F_SIZE_CHANGED (1U << 8) -#define IOMAP_F_STALE (1U << 9) - -/* - * Flags from 0x1000 up are for file system specific usage: - */ -#define IOMAP_F_PRIVATE (1U << 12) - +#define IOMAP_F_SIZE_CHANGED (1U << 14) +#define IOMAP_F_STALE (1U << 15) /* * Magic value for addr: -- cgit v1.2.3 From 079a206f51f0c183be96a2f78f183dee42df1d4a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 21 Mar 2025 16:40:47 +0200 Subject: seq_buf: Mark binary printing functions with __printf() attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Binary printing functions are using printf() type of format, and compiler is not happy about them as is: lib/seq_buf.c:162:17: error: function ‘seq_buf_bprintf’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] Fix the compilation errors by adding __printf() attribute. 
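For context, __printf(f, v) expands to the compiler's format attribute:
argument f is the printf-style format string and argument v is the first
variadic argument to type-check against it. v = 0, as used throughout
this series, means there are no varargs to check, which fits these
binary helpers that take a pre-packed u32 array (or a va_list) instead.
A hedged sketch (the function and struct names are hypothetical):

	/* Ordinary varargs: format is argument 2, values start at argument 3. */
	__printf(2, 3)
	void foo_printf(struct foo *f, const char *fmt, ...);

	/* va_list and binary forms: format is argument 2, nothing to type-check. */
	__printf(2, 0)
	void foo_vprintf(struct foo *f, const char *fmt, va_list args);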
Signed-off-by: Andy Shevchenko
Reviewed-by: Kees Cook
Reviewed-by: Petr Mladek
Link: https://lore.kernel.org/r/20250321144822.324050-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Petr Mladek
---
 include/linux/seq_buf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index fe41da005970..52791e070506 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -167,8 +167,8 @@ extern int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str,
 			    const void *buf, size_t len, bool ascii);
 
 #ifdef CONFIG_BINARY_PRINTF
-extern int
-seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary);
+__printf(2, 0)
+int seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary);
 #endif
 
 void seq_buf_do_printk(struct seq_buf *s, const char *lvl);
-- cgit v1.2.3 

From 6b2c1e30ad6846624d935a7ea98dae60458126b8 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Fri, 21 Mar 2025 16:40:48 +0200
Subject: seq_file: Mark binary printing functions with __printf() attribute
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Binary printing functions are using printf() type of format, and
compiler is not happy about them as is:

  fs/seq_file.c:418:35: error: function ‘seq_bprintf’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format]

Fix the compilation errors by adding __printf() attribute.

Signed-off-by: Andy Shevchenko
Reviewed-by: Kees Cook
Reviewed-by: Petr Mladek
Link: https://lore.kernel.org/r/20250321144822.324050-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Petr Mladek
---
 include/linux/seq_file.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 2fb266ea69fa..d6ebf0596510 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -181,6 +181,7 @@ int seq_open_private(struct file *, const struct seq_operations *, int);
 int seq_release_private(struct inode *, struct file *);
 
 #ifdef CONFIG_BINARY_PRINTF
+__printf(2, 0)
 void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary);
 #endif
 
-- cgit v1.2.3 

From 196a062641fe68d9bfe0ad36b6cd7628c99ad22c Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Fri, 21 Mar 2025 16:40:49 +0200
Subject: tracing: Mark binary printing functions with __printf() attribute
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Binary printing functions are using printf() type of format, and
compiler is not happy about them as is:

  kernel/trace/trace.c:3292:9: error: function ‘trace_vbprintk’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format]
  kernel/trace/trace_seq.c:182:9: error: function ‘trace_seq_bprintf’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format]

Fix the compilation errors by adding __printf() attribute.

While at it, move existing __printf() attributes from the
implementations to the declarations. It also fixes incorrect attribute
parameters that are used for trace_array_printk().
Signed-off-by: Andy Shevchenko Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20250321144822.324050-4-andriy.shevchenko@linux.intel.com Signed-off-by: Petr Mladek --- include/linux/trace.h | 4 ++-- include/linux/trace_seq.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace.h b/include/linux/trace.h index fdcd76b7be83..7eaad857dee0 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -72,8 +72,8 @@ static inline int unregister_ftrace_export(struct trace_export *export) static inline void trace_printk_init_buffers(void) { } -static inline int trace_array_printk(struct trace_array *tr, unsigned long ip, - const char *fmt, ...) +static inline __printf(3, 4) +int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { return 0; } diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 1ef95c0287f0..a93ed5ac3226 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -88,8 +88,8 @@ extern __printf(2, 3) void trace_seq_printf(struct trace_seq *s, const char *fmt, ...); extern __printf(2, 0) void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args); -extern void -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); +extern __printf(2, 0) +void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); extern int trace_print_seq(struct seq_file *m, struct trace_seq *s); extern int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt); @@ -113,8 +113,8 @@ static inline __printf(2, 3) void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { } -static inline void -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +static inline __printf(2, 0) +void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { } -- cgit v1.2.3 From 7bf819aa992faee980610e9021aec0c38aac53be Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 21 Mar 2025 16:40:50 +0200 Subject: vsnprintf: Mark binary printing functions with __printf() attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Binary printf() functions are using printf() type of format, and compiler is not happy about them as is: lib/vsprintf.c:3130:47: error: function ‘vbin_printf’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] lib/vsprintf.c:3298:33: error: function ‘bstr_printf’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] Fix the compilation errors by adding __printf() attribute. 
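For background, this pair splits formatting into two stages:
vbin_printf() walks the format once and packs the arguments into a u32
array, and bstr_printf() renders the final string later from the same
format plus that packed array (the deferred formatting the trace ring
buffer relies on). A hedged usage sketch; the wrapper is hypothetical,
and note that vbin_printf()'s size argument counts 32-bit words, not
bytes:

	static int pack_args(u32 *bin, size_t words, const char *fmt, ...)
	{
		va_list args;
		int ret;

		va_start(args, fmt);
		ret = vbin_printf(bin, words, fmt, args);
		va_end(args);
		return ret;
	}

	u32 bin[64];
	char out[128];

	pack_args(bin, ARRAY_SIZE(bin), "%s used %d times\n", name, count);
	bstr_printf(out, sizeof(out), "%s used %d times\n", bin);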
Signed-off-by: Andy Shevchenko Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20250321144822.324050-5-andriy.shevchenko@linux.intel.com Signed-off-by: Petr Mladek --- include/linux/string.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 0403a4ca4c11..01621ad0f598 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -336,8 +336,8 @@ int __sysfs_match_string(const char * const *array, size_t n, const char *s); #define sysfs_match_string(_a, _s) __sysfs_match_string(_a, ARRAY_SIZE(_a), _s) #ifdef CONFIG_BINARY_PRINTF -int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); -int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); +__printf(3, 0) int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); +__printf(3, 0) int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); #endif extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, -- cgit v1.2.3 From bcba8d4dbe6880ce9883409df486de35d3946704 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Mar 2025 11:45:40 -0500 Subject: ring-buffer: Use kaslr address instead of text delta Instead of saving off the text and data pointers and using them to compare with the current boot's text and data pointers, just save off the KASLR offset. Then that can be used to figure out how to read the previous boots buffer. The last_boot_info will now show this offset, but only if it is for a previous boot: ~# cat instances/boot_mapped/last_boot_info 39000000 [kernel] ~# echo function > instances/boot_mapped/current_tracer ~# cat instances/boot_mapped/last_boot_info # Current If the KASLR offset saved is for the current boot, the last_boot_info will show the value of "current". Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250305164608.274956504@goodmis.org Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 17fbb7855295..8de035f4f0d9 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -94,8 +94,7 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag unsigned long range_size, struct lock_class_key *key); -bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text, - long *data); +bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, unsigned long *kaslr_addr); /* * Because the ring buffer is generic, if other users of the ring buffer get -- cgit v1.2.3 From 4af0a9c518522892b36cb7ecedf0c6004dc0a581 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Mar 2025 11:45:42 -0500 Subject: ring-buffer: Add ring_buffer_meta_scratch() Now that there's one meta data at the start of the persistent memory used by the ring buffer, allow the caller to request some memory right after that data that it can use as its own persistent memory. Also fix some white space issues with ring_buffer_alloc(). 
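A hedged sketch of the intended usage (the scratch layout and the
nr_bytes/start/range_size variables are hypothetical; the interfaces are
the ones declared below, and kaslr_offset() stands in for an arch
helper):

	struct tracer_scratch {
		unsigned long kaslr_addr;
	};

	struct trace_buffer *buf;
	struct tracer_scratch *scratch;
	unsigned int size;

	buf = ring_buffer_alloc_range(nr_bytes, RB_FL_OVERWRITE, 0, start,
				      range_size, sizeof(*scratch));
	scratch = ring_buffer_meta_scratch(buf, &size);
	if (scratch && size >= sizeof(*scratch))
		scratch->kaslr_addr = kaslr_offset();	/* illustrative */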
Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250305164608.619631731@goodmis.org Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 8de035f4f0d9..019b59a0bbc9 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -92,9 +92,11 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long range_size, + unsigned long scratch_size, struct lock_class_key *key); bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, unsigned long *kaslr_addr); +void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size); /* * Because the ring buffer is generic, if other users of the ring buffer get @@ -112,11 +114,11 @@ bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, unsigned long *kas * traced by ftrace, it can produce lockdep warnings. We need to keep each * ring buffer's lock class separate. */ -#define ring_buffer_alloc_range(size, flags, order, start, range_size) \ +#define ring_buffer_alloc_range(size, flags, order, start, range_size, s_size) \ ({ \ static struct lock_class_key __key; \ __ring_buffer_alloc_range((size), (flags), (order), (start), \ - (range_size), &__key); \ + (range_size), (s_size), &__key); \ }) typedef bool (*ring_buffer_cond_fn)(void *data); -- cgit v1.2.3 From b65334825fb14cae15e93d627c3cfc2986c7eea6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Mar 2025 11:45:43 -0500 Subject: tracing: Have persistent trace instances save KASLR offset There's no reason to save the KASLR offset for the ring buffer itself. That is used by the tracer. Now that the tracer has a way to save data in the persistent memory of the ring buffer, have the tracing infrastructure take care of the saving of the KASLR offset. Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250305164608.792722274@goodmis.org Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 019b59a0bbc9..56e27263acf8 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -95,7 +95,6 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag unsigned long scratch_size, struct lock_class_key *key); -bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, unsigned long *kaslr_addr); void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size); /* -- cgit v1.2.3 From 966b7d0e524da03a18ba1b111d0fa0d81e840f77 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Mar 2025 11:45:44 -0500 Subject: module: Add module_for_each_mod() function The tracing system needs a way to save all the currently loaded modules and their addresses into persistent memory so that it can evaluate the addresses on a reboot from a crash. When the persistent memory trace starts, it will load the module addresses and names into the persistent memory. 
To do so, it will call the module_for_each_mod() function and pass it a function and data structure to get called on each loaded module. Then it can record the memory. This only implements that function. Cc: Luis Chamberlain Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Sami Tolvanen Cc: Daniel Gomez Cc: linux-modules@vger.kernel.org Link: https://lore.kernel.org/20250305164608.962615966@goodmis.org Acked-by: Petr Pavlu Signed-off-by: Steven Rostedt (Google) --- include/linux/module.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 30e5b19bafa9..9a71dd2cb11f 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -782,6 +782,8 @@ static inline void *module_writable_address(struct module *mod, void *loc) return __module_writable_address(mod, loc); } +void module_for_each_mod(int(*func)(struct module *mod, void *data), void *data); + #else /* !CONFIG_MODULES... */ static inline struct module *__module_address(unsigned long addr) @@ -894,6 +896,10 @@ static inline void *module_writable_address(struct module *mod, void *loc) { return loc; } + +static inline void module_for_each_mod(int(*func)(struct module *mod, void *data), void *data) +{ +} #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 74e2498ccf7b303e7fdd881f58a849e884afb486 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 19 Feb 2025 00:08:58 +0900 Subject: mm/memblock: Add reserved memory release function Add reserve_mem_release_by_name() to release a reserved memory region with a given name. This allows us to release reserved memory which is defined by kernel cmdline, after boot. Signed-off-by: Masami Hiramatsu (Google) Acked-by: Mike Rapoport (Microsoft) Cc: Andrew Morton Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: linux-mm@kvack.org Link: https://lore.kernel.org/173989133862.230693.14094993331347437600.stgit@devnote2 Signed-off-by: Steven Rostedt (Google) --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b1068ddcbb7..1ee9e7447485 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4123,6 +4123,7 @@ void vma_pgtable_walk_begin(struct vm_area_struct *vma); void vma_pgtable_walk_end(struct vm_area_struct *vma); int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size); +int reserve_mem_release_by_name(const char *name); #ifdef CONFIG_64BIT int do_mseal(unsigned long start, size_t len_in, unsigned long flags); -- cgit v1.2.3 From 803f97298e7de9242eb677a1351dcafbbcc9117e Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 11:01:42 -0700 Subject: iommufd: Extend IOMMU_GET_HW_INFO to report PASID capability PASID usage requires PASID support in both device and IOMMU. Since the iommu drivers always enable the PASID capability for the device if it is supported, this extends the IOMMU_GET_HW_INFO to report the PASID capability to userspace. Also, enhances the selftest accordingly. 
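A hedged sketch of the consumer side (the surrounding code is
hypothetical; judging by the !CONFIG_PCI_PASID stub below, a negative
errno signals that no usable PASID capability exists, and the success
value is assumed to carry the PASID control state, e.g.
PCI_PASID_CTRL_ENABLE from pci_regs.h):

	int ret = pci_pasid_status(pdev);

	if (ret < 0)
		return ret;		/* no (usable) PASID capability */
	report_pasid_caps(info, ret);	/* hypothetical reporting helper */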
Link: https://patch.msgid.link/r/20250321180143.8468-5-yi.l.liu@intel.com Cc: Bjorn Helgaas Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Zhangfei Gao #aarch64 platform Tested-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/linux/pci-ats.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h index 0e8b74e63767..75c6c86cf09d 100644 --- a/include/linux/pci-ats.h +++ b/include/linux/pci-ats.h @@ -42,6 +42,7 @@ int pci_enable_pasid(struct pci_dev *pdev, int features); void pci_disable_pasid(struct pci_dev *pdev); int pci_pasid_features(struct pci_dev *pdev); int pci_max_pasids(struct pci_dev *pdev); +int pci_pasid_status(struct pci_dev *pdev); #else /* CONFIG_PCI_PASID */ static inline int pci_enable_pasid(struct pci_dev *pdev, int features) { return -EINVAL; } @@ -50,6 +51,8 @@ static inline int pci_pasid_features(struct pci_dev *pdev) { return -EINVAL; } static inline int pci_max_pasids(struct pci_dev *pdev) { return -EINVAL; } +static inline int pci_pasid_status(struct pci_dev *pdev) +{ return -EINVAL; } #endif /* CONFIG_PCI_PASID */ #endif /* LINUX_PCI_ATS_H */ -- cgit v1.2.3 From 112e43e9fd3b999513b1914e2bf523ae509f4c7d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 28 Mar 2025 11:22:54 -0700 Subject: Revert "Merge tag 'irq-msi-2025-03-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip" This reverts commit 36f5f026df6c1cd8a20373adc4388d2b3401ce91, reversing changes made to 43a7eec035a5b64546c8adefdc9cf96a116da14b. Thomas says: "I just noticed that for some incomprehensible reason, probably sheer incompetemce when trying to utilize b4, I managed to merge an outdated _and_ buggy version of that series. Can you please revert that merge completely?" Done. Requested-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- include/linux/cleanup.h | 17 ----------------- include/linux/irqdomain.h | 2 -- include/linux/msi.h | 12 +++++++----- 3 files changed, 7 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 2b32a5759b22..7e57047e1564 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -216,23 +216,6 @@ const volatile void * __must_check_fn(const volatile void *val) #define return_ptr(p) return no_free_ptr(p) -/* - * Only for situations where an allocation is handed in to another function - * and consumed by that function on success. - * - * struct foo *f __free(kfree) = kzalloc(sizeof(*f), GFP_KERNEL); - * - * setup(f); - * if (some_condition) - * return -EINVAL; - * .... 
- * ret = bar(f); - * if (!ret) - * retain_ptr(f); - * return ret; - */ -#define retain_ptr(p) \ - __get_and_null(p, NULL) /* * DEFINE_CLASS(name, type, exit, init, init_args...): diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 5126482515cb..33ff41eef8f7 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -281,8 +281,6 @@ static inline struct fwnode_handle *irq_domain_alloc_fwnode(phys_addr_t *pa) void irq_domain_free_fwnode(struct fwnode_handle *fwnode); -DEFINE_FREE(irq_domain_free_fwnode, struct fwnode_handle *, if (_T) irq_domain_free_fwnode(_T)) - struct irq_domain_chip_generic_info; /** diff --git a/include/linux/msi.h b/include/linux/msi.h index e71c991d7f61..86e42742fd0f 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -80,6 +80,7 @@ struct device_attribute; struct irq_domain; struct irq_affinity_desc; +void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg); #ifdef CONFIG_GENERIC_MSI_IRQ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg); #else @@ -228,11 +229,8 @@ struct msi_dev_domain { int msi_setup_device_data(struct device *dev); -void __msi_lock_descs(struct device *dev); -void __msi_unlock_descs(struct device *dev); - -DEFINE_LOCK_GUARD_1(msi_descs_lock, struct device, __msi_lock_descs(_T->lock), - __msi_unlock_descs(_T->lock)); +void msi_lock_descs(struct device *dev); +void msi_unlock_descs(struct device *dev); struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid, enum msi_desc_filter filter); @@ -636,6 +634,8 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid); bool msi_match_device_irq_domain(struct device *dev, unsigned int domid, enum irq_domain_bus_token bus_token); +int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last); int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs); @@ -644,6 +644,8 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u const struct irq_affinity_desc *affdesc, union msi_instance_cookie *cookie); +void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last); void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid); -- cgit v1.2.3 From a30d4ff8193ef768dbb524824c7aa07c5486a63a Mon Sep 17 00:00:00 2001 From: Nir Lichtman Date: Tue, 4 Feb 2025 05:47:41 +0000 Subject: kdb: remove usage of static environment buffer Problem: The set environment variable logic uses a static "heap" like buffer to store the values of the variables, and they are never freed, on top of that this is redundant since the kernel supplies allocation facilities which are even used also in this file. Solution: Remove the weird static buffer logic and use kmalloc instead, call kfree when overriding an existing variable. 
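A minimal sketch of the pattern the commit describes (the table, helper
and GFP flag are illustrative only; the real logic lives in
kernel/debug/kdb):

	static char *kdb_env[32];	/* one "name=value" string per slot */

	static int kdb_env_store(int slot, const char *name, const char *val)
	{
		char *entry = kmalloc(strlen(name) + strlen(val) + 2, GFP_KERNEL);

		if (!entry)
			return KDB_KMALLOCFAILED;
		sprintf(entry, "%s=%s", name, val);
		kfree(kdb_env[slot]);	/* free any value being overridden */
		kdb_env[slot] = entry;
		return 0;
	}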
Signed-off-by: Nir Lichtman Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20250204054741.GB1219827@lichtman.org Signed-off-by: Daniel Thompson --- include/linux/kdb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 905a2e2f45f6..ecbf819deeca 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -104,7 +104,7 @@ extern int kdb_initial_cpu; #define KDB_NOENVVALUE (-6) #define KDB_NOTIMP (-7) #define KDB_ENVFULL (-8) -#define KDB_ENVBUFFULL (-9) +#define KDB_KMALLOCFAILED (-9) #define KDB_TOOMANYBPT (-10) #define KDB_TOOMANYDBREGS (-11) #define KDB_DUPBPT (-12) -- cgit v1.2.3 From 296e16961817e8e5f574661febc608ac0c0c0108 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 08:55:47 +0100 Subject: io_uring: hide caches sqes from drivers There is now an io_uring private part of cmd async_data, move saved sqe into it. Drivers are accessing it via struct io_uring_cmd::cmd. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ecbe078dd57acefdbc4366d083327086c0879378.1743357121.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index e6723fa95160..0634a3de1782 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -21,7 +21,6 @@ struct io_uring_cmd { struct io_uring_cmd_data { void *op_data; - struct io_uring_sqe sqes[2]; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) -- cgit v1.2.3 From 09f37f2d7b21ff35b8b533f9ab8cfad2fe8f72f6 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 31 Mar 2025 21:26:44 -0700 Subject: sched/smt: Always inline sched_smt_active() sched_smt_active() can be called from noinstr code, so it should always be inlined. The CONFIG_SCHED_SMT version already has __always_inline. Do the same for its !CONFIG_SCHED_SMT counterpart. Fixes the following warning: vmlinux.o: error: objtool: intel_idle_ibrs+0x13: call to sched_smt_active() leaves .noinstr.text section Fixes: 321a874a7ef8 ("sched/smt: Expose sched_smt_present static key") Reported-by: kernel test robot Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/1d03907b0a247cf7fb5c1d518de378864f603060.1743481539.git.jpoimboe@kernel.org Closes: https://lore.kernel.org/r/202503311434.lyw2Tveh-lkp@intel.com/ --- include/linux/sched/smt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h index fb1e295e7e63..166b19af956f 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -12,7 +12,7 @@ static __always_inline bool sched_smt_active(void) return static_branch_likely(&sched_smt_present); } #else -static inline bool sched_smt_active(void) { return false; } +static __always_inline bool sched_smt_active(void) { return false; } #endif void arch_smt_update(void); -- cgit v1.2.3 From 9ac50f7311dc8b39e355582f14c1e82da47a8196 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 31 Mar 2025 21:26:45 -0700 Subject: context_tracking: Always inline ct_{nmi,irq}_{enter,exit}() Thanks to CONFIG_DEBUG_SECTION_MISMATCH, empty functions can be generated out of line. These can be called from noinstr code, so make sure they're always inlined. 
Fixes the following warnings: vmlinux.o: warning: objtool: irqentry_nmi_enter+0xa2: call to ct_nmi_enter() leaves .noinstr.text section vmlinux.o: warning: objtool: irqentry_nmi_exit+0x16: call to ct_nmi_exit() leaves .noinstr.text section vmlinux.o: warning: objtool: irqentry_exit+0x78: call to ct_irq_exit() leaves .noinstr.text section Fixes: 6f0e6c1598b1 ("context_tracking: Take IRQ eqs entrypoints over RCU") Reported-by: Randy Dunlap Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Frederic Weisbecker Cc: Paul E. McKenney Cc: Linus Torvalds Link: https://lore.kernel.org/r/8509bce3f536bcd4ae7af3a2cf6930d48c5e631a.1743481539.git.jpoimboe@kernel.org Closes: https://lore.kernel.org/d1eca076-fdde-484a-b33e-70e0d167c36d@infradead.org --- include/linux/context_tracking_irq.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking_irq.h b/include/linux/context_tracking_irq.h index c50b5670c4a5..197916ee91a4 100644 --- a/include/linux/context_tracking_irq.h +++ b/include/linux/context_tracking_irq.h @@ -10,12 +10,12 @@ void ct_irq_exit_irqson(void); void ct_nmi_enter(void); void ct_nmi_exit(void); #else -static inline void ct_irq_enter(void) { } -static inline void ct_irq_exit(void) { } +static __always_inline void ct_irq_enter(void) { } +static __always_inline void ct_irq_exit(void) { } static inline void ct_irq_enter_irqson(void) { } static inline void ct_irq_exit_irqson(void) { } -static inline void ct_nmi_enter(void) { } -static inline void ct_nmi_exit(void) { } +static __always_inline void ct_nmi_enter(void) { } +static __always_inline void ct_nmi_exit(void) { } #endif #endif -- cgit v1.2.3 From 6309a5c43b0dc629851f25b2e5ef8beff61d08e5 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 31 Mar 2025 21:26:46 -0700 Subject: rcu-tasks: Always inline rcu_irq_work_resched() Thanks to CONFIG_DEBUG_SECTION_MISMATCH, empty functions can be generated out of line. rcu_irq_work_resched() can be called from noinstr code, so make sure it's always inlined. Fixes: 564506495ca9 ("rcu/context-tracking: Move deferred nocb resched to context tracking") Reported-by: Randy Dunlap Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Frederic Weisbecker Cc: Paul E. 
McKenney
Cc: Linus Torvalds
Link: https://lore.kernel.org/r/e84f15f013c07e4c410d972e75620c53b62c1b3e.1743481539.git.jpoimboe@kernel.org
Closes: https://lore.kernel.org/d1eca076-fdde-484a-b33e-70e0d167c36d@infradead.org
---
 include/linux/rcupdate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index f8159f8a7d73..120536f4c6eb 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -132,7 +132,7 @@ static inline void rcu_sysrq_end(void) { }
 #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
 void rcu_irq_work_resched(void);
 #else
-static inline void rcu_irq_work_resched(void) { }
+static __always_inline void rcu_irq_work_resched(void) { }
 #endif
 
 #ifdef CONFIG_RCU_NOCB_CPU
-- cgit v1.2.3 

From c2004b6efb1c891b80bef186771a3668fc25f6b3 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni
Date: Mon, 3 Mar 2025 23:36:37 +0100
Subject: rtc: mt6397: drop unused defines

RTC_NUM_YEARS has never been used; the other defines have not been used
since commit 34bbdc12d04e ("rtc: mt6359: Add RTC hardware range and add
support for start-year").

Reviewed-by: AngeloGioacchino Del Regno
Link: https://lore.kernel.org/r/20250303223637.1135362-1-alexandre.belloni@bootlin.com
Signed-off-by: Alexandre Belloni
---
 include/linux/mfd/mt6397/rtc.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/mt6397/rtc.h b/include/linux/mfd/mt6397/rtc.h
index 068ae1c0f0e8..27883af44f87 100644
--- a/include/linux/mfd/mt6397/rtc.h
+++ b/include/linux/mfd/mt6397/rtc.h
@@ -60,11 +60,6 @@
 #define RTC_PDN2	0x002e
 #define RTC_PDN2_PWRON_ALARM	BIT(4)
 
-#define RTC_MIN_YEAR	1968
-#define RTC_BASE_YEAR	1900
-#define RTC_NUM_YEARS	128
-#define RTC_MIN_YEAR_OFFSET	(RTC_MIN_YEAR - RTC_BASE_YEAR)
-
 #define MTK_RTC_POLL_DELAY_US	10
 #define MTK_RTC_POLL_TIMEOUT	(jiffies_to_usecs(HZ))
-- cgit v1.2.3 

From b98072af60a7ce19214c80f10a7daab924c69182 Mon Sep 17 00:00:00 2001
From: Yu Zhao
Date: Wed, 8 Jan 2025 00:48:21 -0700
Subject: mm/hugetlb_vmemmap: fix memory loads ordering

Using x86_64 as an example, for a 32KB struct page[] area describing a
2MB hugeTLB, HVO reduces the area to 4KB by the following steps:
1. Split the (r/w vmemmap) PMD mapping the area into 512 (r/w) PTEs;
2. For the 8 PTEs mapping the area, remap PTE 1-7 to the page mapped by
   PTE 0, and at the same time change the permission from r/w to r/o;
3. Free the pages PTE 1-7 used to map, hence the reduction from 32KB to
   4KB.

However, the following race can happen due to improper ordering of the
memory loads:

  CPU 1 (HVO)                     CPU 2 (speculative PFN walker)

  page_ref_freeze()
  synchronize_rcu()
                                  rcu_read_lock()
                                  page_is_fake_head() is false
  vmemmap_remap_pte()
  XXX: struct page[] becomes r/o

  page_ref_unfreeze()
                                  page_ref_count() is not zero

                                  atomic_add_unless(&page->_refcount)
                                  XXX: try to modify r/o struct page[]

Specifically, page_is_fake_head() must be ordered after
page_ref_count() on CPU 2 so that it can only return true for this
case, to avoid the later attempt to modify r/o struct page[].

This patch adds the missing memory barrier and makes the tests on
page_is_fake_head() and page_ref_count() done in the proper order.
Link: https://lkml.kernel.org/r/20250108074822.722696-1-yuzhao@google.com Fixes: bd225530a4c7 ("mm/hugetlb_vmemmap: fix race with speculative PFN walkers") Signed-off-by: Yu Zhao Reported-by: Will Deacon Closes: https://lore.kernel.org/20241128142028.GA3506@willie-the-truck/ Reviewed-by: David Hildenbrand Reviewed-by: Muchun Song Acked-by: Will Deacon Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 37 +++++++++++++++++++++++++++++++++++++ include/linux/page_ref.h | 2 +- 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5bd9492a66ee..e6a21b62dcce 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -226,11 +226,48 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page } return page; } + +static __always_inline bool page_count_writable(const struct page *page, int u) +{ + if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) + return true; + + /* + * The refcount check is ordered before the fake-head check to prevent + * the following race: + * CPU 1 (HVO) CPU 2 (speculative PFN walker) + * + * page_ref_freeze() + * synchronize_rcu() + * rcu_read_lock() + * page_is_fake_head() is false + * vmemmap_remap_pte() + * XXX: struct page[] becomes r/o + * + * page_ref_unfreeze() + * page_ref_count() is not zero + * + * atomic_add_unless(&page->_refcount) + * XXX: try to modify r/o struct page[] + * + * The refcount check also prevents modification attempts to other (r/o) + * tail pages that are not fake heads. + */ + if (atomic_read_acquire(&page->_refcount) == u) + return false; + + return page_fixed_fake_head(page) == page; +} #else static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } + +static inline bool page_count_writable(const struct page *page, int u) +{ + return true; +} #endif static __always_inline int page_is_fake_head(const struct page *page) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 8c236c651d1d..544150d1d5fd 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -234,7 +234,7 @@ static inline bool page_ref_add_unless(struct page *page, int nr, int u) rcu_read_lock(); /* avoid writing to the vmemmap area being remapped */ - if (!page_is_fake_head(page) && page_ref_count(page) != u) + if (page_count_writable(page, u)) ret = atomic_add_unless(&page->_refcount, nr, u); rcu_read_unlock(); -- cgit v1.2.3 From 5796d3967c0956734bd1249f76989ca80da0225b Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Wed, 5 Mar 2025 02:17:05 +0000 Subject: mseal sysmap: kernel config and header change MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mseal system mappings", v9. As discussed during mseal() upstream process [1], mseal() protects the VMAs of a given virtual memory range against modifications, such as the read/write (RW) and no-execute (NX) bits. For complete descriptions of memory sealing, please see mseal.rst [2]. The mseal() is useful to mitigate memory corruption issues where a corrupted pointer is passed to a memory management system. For example, such an attacker primitive can break control-flow integrity guarantees since read-only memory that is supposed to be trusted can become writable or .text pages can get remapped. 
The system mappings are read-only; memory sealing can protect them from
ever becoming writable or being unmapped/remapped with different
attributes.

System mappings such as vdso, vvar, vvar_vclock, vectors (arm
compat-mode) and sigpage (arm compat-mode) are created by the kernel
during program initialization, and could be sealed after creation.

Unlike the aforementioned mappings, the uprobe mapping is not
established during program startup. However, its lifetime is the same as
the process's lifetime [3]. It could be sealed from creation.

The vsyscall on x86-64 uses a special address (0xffffffffff600000),
which is outside the mm managed range. This means mprotect, munmap, and
mremap won't work on the vsyscall. Since sealing doesn't enhance the
vsyscall's security, it is skipped in this patch. If we ever seal the
vsyscall, it is probably only for decorative purposes, i.e. showing the
'sl' flag in /proc/pid/smaps. For this patch, it is ignored.

It is important to note that the CHECKPOINT_RESTORE feature (CRIU) may
alter the system mappings during restore operations. UML (User Mode
Linux), gVisor and rr are also known to change the vdso/vvar mappings.
Consequently, this feature cannot be universally enabled across all
systems. As such, CONFIG_MSEAL_SYSTEM_MAPPINGS is disabled by default.

To support mseal of system mappings, architectures must define
CONFIG_ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS and update their special
mappings calls to pass the mseal flag. Additionally, architectures must
confirm they do not unmap/remap system mappings during the process
lifetime. The existence of this flag for an architecture implies that
it does not require the remapping of these system mappings during
process lifetime, so sealing these mappings is safe from a kernel
perspective.

This version covers the x86-64 and arm64 architectures as the minimum
viable feature. While no specific CPU hardware features are required to
enable this feature on an architecture, memory sealing requires a
64-bit kernel. Other architectures can choose whether or not to adopt
this feature. Currently, I'm not aware of any instances in the kernel
code that actively munmap/mremap a system mapping without a request
from userspace. The PPC does call munmap when _install_special_mapping
fails for vdso; however, it's uncertain if this will ever fail for PPC
- this needs to be investigated by PPC in the future [4]. The UML
kernel can add this support when KUnit tests require it [5].

In this version, we've improved the handling of system mapping sealing
from previous versions: instead of modifying the
_install_special_mapping function itself, which would affect all
architectures, we now call _install_special_mapping with a sealing flag
only within the specific architecture that requires it. This targeted
approach offers two key advantages: 1) It limits the code change's
impact to the necessary architectures, and 2) It aligns with the
software architecture by keeping the core memory management within the
mm layer, while delegating the decision of sealing system mappings to
the individual architecture, which is particularly relevant since
32-bit architectures never require sealing.

Prior to this patch series, we explored sealing special mappings from
userspace using glibc's dynamic linker. This approach revealed several
issues:

- The PT_LOAD header may report an incorrect length for vdso (smaller
  than its actual size). The dynamic linker, which relies on PT_LOAD
  information to determine mapping size, would then split and partially
  seal the vdso mapping. Since each architecture has its own vdso/vvar
  code, fixing this in the kernel would require going through each
  architecture. Our initial goal was to enable sealing read-only
  mappings, e.g. .text, across all architectures; sealing vdso from the
  kernel since creation appears to be simpler than sealing vdso at
  glibc.

- The [vvar] mapping header only contains address information, not
  length information. Similar issues might exist for other special
  mappings.

- Mappings like uprobe are not covered by the dynamic linker, and there
  is no effective solution for them.

This feature's security enhancements will benefit ChromeOS, Android,
and other high-security systems.

Testing: This feature was tested on ChromeOS and Android for both
x86-64 and ARM64.

- Enable sealing and verify vdso/vvar, sigpage, vectors are sealed
  properly, i.e. "sl" shown in the smaps for those mappings, and mremap
  is blocked.

- Passing various automation tests (e.g. pre-checkin) on ChromeOS and
  Android to ensure the sealing doesn't affect the functionality of
  Chromebooks and Android phones.

I also tested the feature on Ubuntu on x86-64:

- With the config disabled, vdso/vvar is not sealed.

- With the config enabled, vdso/vvar is sealed, and booting up Ubuntu is
  OK; normal operations such as browsing the web and opening/editing
  documents are OK.

Link: https://lore.kernel.org/all/20240415163527.626541-1-jeffxu@chromium.org/ [1]
Link: Documentation/userspace-api/mseal.rst [2]
Link: https://lore.kernel.org/all/CABi2SkU9BRUnqf70-nksuMCQ+yyiWjo3fM4XkRkL-NrCZxYAyg@mail.gmail.com/ [3]
Link: https://lore.kernel.org/all/CABi2SkV6JJwJeviDLsq9N4ONvQ=EFANsiWkgiEOjyT9TQSt+HA@mail.gmail.com/ [4]
Link: https://lore.kernel.org/all/202502251035.239B85A93@keescook/ [5]

This patch (of 7):

Provide infrastructure to mseal system mappings. Establish two kernel
configs (CONFIG_MSEAL_SYSTEM_MAPPINGS,
ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS) and the VM_SEALED_SYSMAP macro for
future patches.

Link: https://lkml.kernel.org/r/20250305021711.3867874-1-jeffxu@google.com
Link: https://lkml.kernel.org/r/20250305021711.3867874-2-jeffxu@google.com
Signed-off-by: Jeff Xu
Reviewed-by: Kees Cook
Reviewed-by: Liam R. Howlett
Reviewed-by: Lorenzo Stoakes
Cc: Adhemerval Zanella
Cc: Alexander Mikhalitsyn
Cc: Alexey Dobriyan
Cc: Andrei Vagin
Cc: Anna-Maria Behnsen
Cc: Ard Biesheuvel
Cc: Benjamin Berg
Cc: Christoph Hellwig
Cc: Dave Hansen
Cc: David Rientjes
Cc: David S. Miller
Cc: Elliot Hughes
Cc: Florian Faineli
Cc: Greg Ungerer
Cc: Guenter Roeck
Cc: Heiko Carstens
Cc: Helge Deller
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Ingo Molnar
Cc: Jann Horn
Cc: Jason A. Donenfeld
Cc: Johannes Berg
Cc: Jorge Lucangeli Obes
Cc: Linus Waleij
Cc: Mark Rutland
Cc: Matthew Wilcow (Oracle)
Cc: Michael Ellerman
Cc: Michal Hocko
Cc: Miguel Ojeda
Cc: Mike Rapoport
Cc: Oleg Nesterov
Cc: Pedro Falcato
Cc: Peter Xu
Cc: Randy Dunlap
Cc: Stephen Röttger
Cc: Thomas Weißschuh
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 include/linux/mm.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 32ba0e33422b..778f5de6a12e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4236,4 +4236,14 @@ int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *st
 int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
 int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
 
+
+/*
+ * mseal of userspace process's system mappings.
+ */
+#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS
+#define VM_SEALED_SYSMAP VM_SEALED
+#else
+#define VM_SEALED_SYSMAP VM_NONE
+#endif
+
 #endif /* _LINUX_MM_H */
--
cgit v1.2.3

From e2a33a2a3258794891cdd6ca4b1318da6594d157 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Wed, 26 Mar 2025 11:26:06 -0400
Subject: lib/sort.c: add _nonatomic() variants with cond_resched()

bcachefs calls sort() during recovery to sort all the keys it found in the journal, and this may be very large - gigabytes on large machines.

This has been causing "task blocked" warnings, so it needs a cond_resched().

[kent.overstreet@linux.dev: fix kerneldoc]
Link: https://lkml.kernel.org/r/cgsr5a447pxqomc4gvznsp5yroqmif4omd7o5lsr2swifjhoic@yzjjrx2bvrq7
Link: https://lkml.kernel.org/r/20250326152606.2594920-1-kent.overstreet@linux.dev
Signed-off-by: Kent Overstreet
Cc: Kuan-Wei Chiu
Cc: Stephen Rothwell
Signed-off-by: Andrew Morton
---
 include/linux/sort.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sort.h b/include/linux/sort.h
index e163287ac6c1..8e5603b10941 100644
--- a/include/linux/sort.h
+++ b/include/linux/sort.h
@@ -13,4 +13,15 @@ void sort(void *base, size_t num, size_t size,
 	  cmp_func_t cmp_func,
 	  swap_func_t swap_func);
 
+/* Versions that periodically call cond_resched(): */
+
+void sort_r_nonatomic(void *base, size_t num, size_t size,
+		      cmp_r_func_t cmp_func,
+		      swap_r_func_t swap_func,
+		      const void *priv);
+
+void sort_nonatomic(void *base, size_t num, size_t size,
+		    cmp_func_t cmp_func,
+		    swap_func_t swap_func);
+
 #endif
--
cgit v1.2.3

From 149974fdb8e186a1c72b87cee806373624a8a375 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Tue, 25 Mar 2025 21:51:51 +0800
Subject: block: add for_each_mp_bvec()

Add a for_each_mp_bvec() helper for io_uring to import fixed kernel buffers.

Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20250325135155.935398-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 include/linux/bvec.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index ba8f52d48b94..204b22a99c4b 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -184,6 +184,12 @@ static inline void bvec_iter_advance_single(const struct bio_vec *bv,
 	     ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1);	\
 	     bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len))
 
+#define for_each_mp_bvec(bvl, bio_vec, iter, start)		\
+	for (iter = (start);					\
+	     (iter).bi_size &&					\
+	     ((bvl = mp_bvec_iter_bvec((bio_vec), (iter))), 1);	\
+	     bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len))
+
 /* for iterating one bio from start to end */
 #define BVEC_ITER_ALL_INIT (struct bvec_iter)			\
 {								\
--
cgit v1.2.3

From 4c975fd700022c90e61a46326e3444e08317876e Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev
Date: Tue, 1 Apr 2025 09:34:43 -0700
Subject: net: hold instance lock during NETDEV_REGISTER/UP

Callers of inetdev_init can come from several places with inconsistent expectations about the netdev instance lock. Grab the instance lock during REGISTER (plus UP). Also resolve the inconsistency with UNREGISTER, where the lock was held only on the netns-move path.
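The shape of the change, roughly, is that the core takes the per-netdev instance lock before the REGISTER/UP notifiers run, so that lock assertions in notifier callees such as netif_disable_lro() are satisfied. A hedged sketch of that shape - not the literal patch; example_register() is a hypothetical wrapper, while netdev_lock()/netdev_unlock(), call_netdevice_notifiers() and notifier_to_errno() are existing kernel primitives:

	/* Hedged sketch: take the instance lock around the REGISTER
	 * notifier so callees like netif_disable_lro() see it held.
	 * The real call sites and error unwinding are elided. */
	static int example_register(struct net_device *dev)
	{
		int ret;

		netdev_lock(dev);
		/* inetdev_init() -> netif_disable_lro() now runs locked */
		ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
		netdev_unlock(dev);

		return notifier_to_errno(ret);
	}

Without the lock held on this path, the assertion at netdev_lock.h:54 fires: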
WARNING: CPU: 10 PID: 1479 at ./include/net/netdev_lock.h:54 __netdev_update_features+0x65f/0xca0
 __warn+0x81/0x180
 __netdev_update_features+0x65f/0xca0
 report_bug+0x156/0x180
 handle_bug+0x4f/0x90
 exc_invalid_op+0x13/0x60
 asm_exc_invalid_op+0x16/0x20
 __netdev_update_features+0x65f/0xca0
 netif_disable_lro+0x30/0x1d0
 inetdev_init+0x12f/0x1f0
 inetdev_event+0x48b/0x870
 notifier_call_chain+0x38/0xf0
 register_netdevice+0x741/0x8b0
 register_netdev+0x1f/0x40
 mlx5e_probe+0x4e3/0x8e0 [mlx5_core]
 auxiliary_bus_probe+0x3f/0x90
 really_probe+0xc3/0x3a0
 __driver_probe_device+0x80/0x150
 driver_probe_device+0x1f/0x90
 __device_attach_driver+0x7d/0x100
 bus_for_each_drv+0x80/0xd0
 __device_attach+0xb4/0x1c0
 bus_probe_device+0x91/0xa0
 device_add+0x657/0x870

Reviewed-by: Jakub Kicinski
Reported-by: Cosmin Ratiu
Fixes: ad7c7b2172c3 ("net: hold netdev instance lock during sysfs operations")
Signed-off-by: Stanislav Fomichev
Link: https://patch.msgid.link/20250401163452.622454-3-sdf@fomichev.me
Signed-off-by: Jakub Kicinski
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fa79145518d1..cf3b6445817b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4192,7 +4192,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags,
 int netif_set_alias(struct net_device *dev, const char *alias, size_t len);
 int dev_set_alias(struct net_device *, const char *, size_t);
 int dev_get_alias(const struct net_device *, char *, size_t);
-int netif_change_net_namespace(struct net_device *dev, struct net *net,
+int __dev_change_net_namespace(struct net_device *dev, struct net *net,
			       const char *pat, int new_ifindex,
			       struct netlink_ext_ack *extack);
 int dev_change_net_namespace(struct net_device *dev, struct net *net,
--
cgit v1.2.3

From 825dfab23bca520629a9e5a21ba5b03aaccc75f2 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)"
Date: Wed, 19 Mar 2025 10:28:55 +0100
Subject: irqdomain: Rename irq_set_default_host() to irq_set_default_domain()

Naming interrupt domains host is confusing at best and the irqdomain code uses both domain and host inconsistently. Therefore rename irq_set_default_host() to irq_set_default_domain().

Signed-off-by: Jiri Slaby (SUSE)
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/all/20250319092951.37667-3-jirislaby@kernel.org
---
 include/linux/irqdomain.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 33ff41eef8f7..4b5c495b5710 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -352,7 +352,7 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode,
					    void *host_data);
 struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
					    enum irq_domain_bus_token bus_token);
-void irq_set_default_host(struct irq_domain *host);
+void irq_set_default_domain(struct irq_domain *domain);
 struct irq_domain *irq_get_default_host(void);
 int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
			   irq_hw_number_t hwirq, int node,
--
cgit v1.2.3

From 0a27ea384c82e70d16e40adbaebeb3725f7e6342 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)"
Date: Wed, 19 Mar 2025 10:28:56 +0100
Subject: irqdomain: Rename irq_get_default_host() to irq_get_default_domain()

Naming interrupt domains host is confusing at best and the irqdomain code uses both domain and host inconsistently.
Therefore rename irq_get_default_host() to irq_get_default_domain().

Signed-off-by: Jiri Slaby (SUSE)
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/all/20250319092951.37667-4-jirislaby@kernel.org
---
 include/linux/irqdomain.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 4b5c495b5710..e9ab95fbc5a9 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -353,7 +353,7 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode,
 struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
					    enum irq_domain_bus_token bus_token);
 void irq_set_default_domain(struct irq_domain *domain);
-struct irq_domain *irq_get_default_host(void);
+struct irq_domain *irq_get_default_domain(void);
 int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
			   irq_hw_number_t hwirq, int node,
			   const struct irq_affinity_desc *affinity);
--
cgit v1.2.3

From d2705d33885e3a19f727dff2521fb7d5b1fc5cda Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)"
Date: Wed, 19 Mar 2025 10:28:57 +0100
Subject: irqdomain: Stop using 'host' for domain

It is confusing to see both 'host' and 'domain' used to mean 'domain'. Given this header is all about domains, switch the remaining 'host' uses to 'domain'.

Signed-off-by: Jiri Slaby (SUSE)
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/all/20250319092951.37667-5-jirislaby@kernel.org
---
 include/linux/irqdomain.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index e9ab95fbc5a9..bb7111105296 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -72,7 +72,7 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
 
 /**
  * struct irq_domain_ops - Methods for irq_domain objects
- * @match: Match an interrupt controller device node to a host, returns
+ * @match: Match an interrupt controller device node to a domain, returns
  *	   1 on a match
  * @select: Match an interrupt controller fw specification. It is more generic
  *	    than @match as it receives a complete struct irq_fwspec. Therefore,
@@ -454,7 +454,7 @@ static inline struct irq_domain *irq_domain_add_nomap(struct device_node *of_nod
	return IS_ERR(d) ? NULL : d;
 }
 
-unsigned int irq_create_direct_mapping(struct irq_domain *host);
+unsigned int irq_create_direct_mapping(struct irq_domain *domain);
 #endif
 
 static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
@@ -507,7 +507,7 @@ static inline struct irq_domain *irq_domain_create_tree(struct fwnode_handle *fw
	return IS_ERR(d) ? NULL : d;
 }
 
-void irq_domain_remove(struct irq_domain *host);
+void irq_domain_remove(struct irq_domain *domain);
 
 int irq_domain_associate(struct irq_domain *domain, unsigned int irq,
			 irq_hw_number_t hwirq);
@@ -515,16 +515,16 @@ void irq_domain_associate_many(struct irq_domain *domain,
			       unsigned int irq_base,
			       irq_hw_number_t hwirq_base, int count);
 
-unsigned int irq_create_mapping_affinity(struct irq_domain *host,
+unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
					 irq_hw_number_t hwirq,
					 const struct irq_affinity_desc *affinity);
 unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec);
 void irq_dispose_mapping(unsigned int virq);
 
-static inline unsigned int irq_create_mapping(struct irq_domain *host,
+static inline unsigned int irq_create_mapping(struct irq_domain *domain,
					      irq_hw_number_t hwirq)
 {
-	return irq_create_mapping_affinity(host, hwirq, NULL);
+	return irq_create_mapping_affinity(domain, hwirq, NULL);
 }
 
 struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain,
--
cgit v1.2.3

From 8fa7292fee5c5240402371ea89ab285ec856c916 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sat, 5 Apr 2025 10:17:26 +0200
Subject: treewide: Switch/rename to timer_delete[_sync]()

timer_delete[_sync]() replaces del_timer[_sync](). Convert the whole tree over and remove the historical wrapper inlines.

Conversion was done with coccinelle plus manual fixups where necessary.

Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 include/linux/timer.h | 36 +-----------------------------------
 1 file changed, 1 insertion(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index e67ecd1cbc97..10596d7c3a34 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -30,7 +30,7 @@
  *
  * @TIMER_IRQSAFE: An irqsafe timer is executed with IRQ disabled and
  * it's safe to wait for the completion of the running instance from
- * IRQ handlers, for example, by calling del_timer_sync().
+ * IRQ handlers, for example, by calling timer_delete_sync().
  *
  * Note: The irq disabled callback execution is a special case for
  * workqueue locking issues. It's not meant for executing random crap
@@ -168,40 +168,6 @@ extern int timer_delete(struct timer_list *timer);
 extern int timer_shutdown_sync(struct timer_list *timer);
 extern int timer_shutdown(struct timer_list *timer);
 
-/**
- * del_timer_sync - Delete a pending timer and wait for a running callback
- * @timer: The timer to be deleted
- *
- * See timer_delete_sync() for detailed explanation.
- *
- * Do not use in new code. Use timer_delete_sync() instead.
- *
- * Returns:
- * * %0 - The timer was not pending
- * * %1 - The timer was pending and deactivated
- */
-static inline int del_timer_sync(struct timer_list *timer)
-{
-	return timer_delete_sync(timer);
-}
-
-/**
- * del_timer - Delete a pending timer
- * @timer: The timer to be deleted
- *
- * See timer_delete() for detailed explanation.
- *
- * Do not use in new code. Use timer_delete() instead.
- *
- * Returns:
- * * %0 - The timer was not pending
- * * %1 - The timer was pending and deactivated
- */
-static inline int del_timer(struct timer_list *timer)
-{
-	return timer_delete(timer);
-}
-
 extern void init_timers(void);
 struct hrtimer;
 extern enum hrtimer_restart it_real_fn(struct hrtimer *);
--
cgit v1.2.3

From 9779489a31d77a7b9cb6f20d2d2caced4e29dbe6 Mon Sep 17 00:00:00 2001
From: Nam Cao
Date: Wed, 5 Feb 2025 11:55:10 +0100
Subject: hrtimers: Delete hrtimer_init()

hrtimer_init() is now unused. Delete it.
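For illustration, the conversion pattern that made hrtimer_init() unused looks roughly like the sketch below - a hedged sketch rather than a quote of any real caller; my_timer, my_timer_fn and my_timer_start are made-up names, and hrtimer_setup()'s signature matches the declaration visible in the hunk that follows:

	static struct hrtimer my_timer;

	static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
	{
		/* callback body elided */
		return HRTIMER_NORESTART;
	}

	static void my_timer_start(void)
	{
		/*
		 * Old two-step pattern, now gone:
		 *	hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		 *	my_timer.function = my_timer_fn;
		 */
		hrtimer_setup(&my_timer, my_timer_fn, CLOCK_MONOTONIC,
			      HRTIMER_MODE_REL);
	}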
Signed-off-by: Nam Cao
Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
Link: https://lore.kernel.org/all/003722f60c7a2a4f8d4ed24fb741aa313b7e5136.1738746927.git.namcao@linutronix.de
---
 include/linux/hrtimer.h       | 2 --
 include/linux/hrtimer_types.h | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 88e078871158..1adcba3ddd76 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -231,8 +231,6 @@ static inline enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused)
 /* Exported timer functions: */
 /* Initialize timers: */
-extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
-			 enum hrtimer_mode mode);
 extern void hrtimer_setup(struct hrtimer *timer,
			  enum hrtimer_restart (*function)(struct hrtimer *),
			  clockid_t clock_id, enum hrtimer_mode mode);
 extern void hrtimer_setup_on_stack(struct hrtimer *timer,
diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h
index ad66a3081735..7c5b27daa89d 100644
--- a/include/linux/hrtimer_types.h
+++ b/include/linux/hrtimer_types.h
@@ -34,7 +34,7 @@ enum hrtimer_restart {
  * @is_hard: Set if hrtimer will be expired in hard interrupt context
  *	     even on RT.
  *
- * The hrtimer structure must be initialized by hrtimer_init()
+ * The hrtimer structure must be initialized by hrtimer_setup()
  */
 struct hrtimer {
	struct timerqueue_node node;
--
cgit v1.2.3

From 04257da0c99c9d4ff7c5bb93046482e1f7d34938 Mon Sep 17 00:00:00 2001
From: Nam Cao
Date: Wed, 5 Feb 2025 11:55:16 +0100
Subject: hrtimers: Make callback function pointer private

Make the struct hrtimer::function field private, to prevent users from changing this field in an unsafe way. hrtimer_update_function() should be used if the callback function needs to be changed.

Signed-off-by: Nam Cao
Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
Link: https://lore.kernel.org/all/7d0e6e0c5c59a64a9bea940051aac05d750bc0c2.1738746927.git.namcao@linutronix.de
---
 include/linux/hrtimer_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h
index 7c5b27daa89d..8fbbb6bdf7a1 100644
--- a/include/linux/hrtimer_types.h
+++ b/include/linux/hrtimer_types.h
@@ -39,7 +39,7 @@ enum hrtimer_restart {
 struct hrtimer {
	struct timerqueue_node node;
	ktime_t _softexpires;
-	enum hrtimer_restart (*function)(struct hrtimer *);
+	enum hrtimer_restart (*__private function)(struct hrtimer *);
	struct hrtimer_clock_base *base;
	u8 state;
	u8 is_rel;
--
cgit v1.2.3
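With struct hrtimer::function now __private, code that needs to switch callbacks after setup goes through the hrtimer_update_function() helper named in the changelog above instead of assigning the field directly. A hedged usage sketch (switch_callback() and new_fn() are hypothetical names):

	static enum hrtimer_restart new_fn(struct hrtimer *t)
	{
		return HRTIMER_NORESTART;
	}

	static void switch_callback(struct hrtimer *timer)
	{
		/* Writing timer->function directly now trips the __private
		 * annotation; the accessor updates it safely instead. */
		hrtimer_update_function(timer, new_fn);
	}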