From 2b2d7ca7ce25fbec8389e7d85e57742caa47c97d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Tue, 10 Dec 2024 10:08:42 +0100 Subject: dma-buf: fix incorrect dma-fence documentation v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There isn't much worse than documentation giving an incorrect advise. Grabbing a spinlock while interrupts are disabled usually means that you must also disable interrupts for all other uses of this spinlock. Otherwise really hard to debug issues can occur. So fix that invalid documentation. v2: use Dmitry's suggestion on the documentation Signed-off-by: Christian König Reviewed-by: Simona Vetter (v1) Link: https://patchwork.freedesktop.org/patch/msgid/20250211163109.12200-2-christian.koenig@amd.com --- include/linux/dma-fence.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index e7ad819962e3..52587d390aca 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -169,8 +169,8 @@ struct dma_fence_ops { * implementation know that there is another driver waiting on the * signal (ie. hw->sw case). * - * This function can be called from atomic context, but not - * from irq context, so normal spinlocks can be used. + * This is called with irq's disabled, so only spinlocks which disable + * IRQ's can be used in the code outside of this callback. * * A return value of false indicates the fence already passed, * or some failure occurred that made it impossible to enable -- cgit v1.2.3 From 2ce07fea3cc8b866f7955a7ce1d62b0cc1f74819 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 18 Sep 2024 08:16:57 +0200 Subject: dma-buf/dma-fence: remove unnecessary callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fence_value_str and timeline_value_str callbacks were just an unnecessary abstraction in the SW sync implementation. The only caller of those callbacks already knew that the fence in questions is a timeline_fence. So print the values directly instead of using a redirection. Additional to that remove the implementations from virtgpu and vgem. As far as I can see those were never used in the first place. Signed-off-by: Christian König Reviewed-by: Simona Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20250211163109.12200-3-christian.koenig@amd.com --- include/linux/dma-fence.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 52587d390aca..b12776883d14 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -238,27 +238,6 @@ struct dma_fence_ops { */ void (*release)(struct dma_fence *fence); - /** - * @fence_value_str: - * - * Callback to fill in free-form debug info specific to this fence, like - * the sequence number. - * - * This callback is optional. - */ - void (*fence_value_str)(struct dma_fence *fence, char *str, int size); - - /** - * @timeline_value_str: - * - * Fills in the current value of the timeline as a string, like the - * sequence number. Note that the specific fence passed to this function - * should not matter, drivers should only use it to look up the - * corresponding timeline structures. - */ - void (*timeline_value_str)(struct dma_fence *fence, - char *str, int size); - /** * @set_deadline: * -- cgit v1.2.3 From de68b17d5d0716c9a02b8a6ffa34f47c8f2f7690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Tue, 11 Feb 2025 15:26:16 +0100 Subject: dma-buf: dma-buf: stop mapping sg_tables on attach v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As a workaround to smoothly transit from static to dynamic DMA-buf handling we cached the sg_table on attach if dynamic handling mismatched between exporter and importer. Since Dmitry and Thomas cleaned that up and also documented the lock handling we can drop this workaround now. V2: implement Sima's comments Signed-off-by: Christian König Reviewed-by: Simona Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20250211163109.12200-4-christian.koenig@amd.com --- include/linux/dma-buf.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 36216d28d8bd..c54ff2dda8cb 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -583,20 +583,6 @@ static inline bool dma_buf_is_dynamic(struct dma_buf *dmabuf) return !!dmabuf->ops->pin; } -/** - * dma_buf_attachment_is_dynamic - check if a DMA-buf attachment uses dynamic - * mappings - * @attach: the DMA-buf attachment to check - * - * Returns true if a DMA-buf importer wants to call the map/unmap functions with - * the dma_resv lock held. - */ -static inline bool -dma_buf_attachment_is_dynamic(struct dma_buf_attachment *attach) -{ - return !!attach->importer_ops; -} - struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf, struct device *dev); struct dma_buf_attachment * -- cgit v1.2.3 From b72f66f22c0e39ae6684c43fead774c13db24e73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Tue, 11 Feb 2025 17:20:53 +0100 Subject: dma-buf: drop caching of sg_tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That was purely for the transition from static to dynamic dma-buf handling and can be removed again now. Signed-off-by: Christian König Reviewed-by: Simona Vetter Reviewed-by: Dmitry Osipenko Link: https://patchwork.freedesktop.org/patch/msgid/20250211163109.12200-5-christian.koenig@amd.com --- include/linux/dma-buf.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index c54ff2dda8cb..544f8f8c3f44 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -34,15 +34,6 @@ struct dma_buf_attachment; * @vunmap: [optional] unmaps a vmap from the buffer */ struct dma_buf_ops { - /** - * @cache_sgt_mapping: - * - * If true the framework will cache the first mapping made for each - * attachment. This avoids creating mappings for attachments multiple - * times. - */ - bool cache_sgt_mapping; - /** * @attach: * @@ -493,8 +484,6 @@ struct dma_buf_attach_ops { * @dmabuf: buffer for this attachment. * @dev: device attached to the buffer. * @node: list of dma_buf_attachment, protected by dma_resv lock of the dmabuf. - * @sgt: cached mapping. - * @dir: direction of cached mapping. * @peer2peer: true if the importer can handle peer resources without pages. * @priv: exporter specific attachment data. * @importer_ops: importer operations for this attachment, if provided @@ -514,8 +503,6 @@ struct dma_buf_attachment { struct dma_buf *dmabuf; struct device *dev; struct list_head node; - struct sg_table *sgt; - enum dma_data_direction dir; bool peer2peer; const struct dma_buf_attach_ops *importer_ops; void *importer_priv; -- cgit v1.2.3 From 62b1fa69f31969e11d03529d3a80599ddda2d043 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 15 Nov 2024 22:52:40 +1300 Subject: KVM: Export hardware virtualization enabling/disabling functions To support TDX, KVM will need to enable TDX during KVM module loading time. Enabling TDX requires enabling hardware virtualization first so that all online CPUs (and the new CPU going online) are in post-VMXON state. KVM by default enables hardware virtualization but that is done in kvm_init(), which must be the last step after all initialization is done thus is too late for enabling TDX. Export functions to enable/disable hardware virtualization so that TDX code can use them to handle hardware virtualization enabling before kvm_init(). Signed-off-by: Kai Huang Message-ID: Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f34f4cfaa513..1e75fa114f34 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2571,4 +2571,12 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range); #endif +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +int kvm_enable_virtualization(void); +void kvm_disable_virtualization(void); +#else +static inline int kvm_enable_virtualization(void) { return 0; } +static inline void kvm_disable_virtualization(void) { } +#endif + #endif -- cgit v1.2.3 From fcdbdf63431c9faf639bffc957ea2ce9b545432e Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 15 Nov 2024 22:52:41 +1300 Subject: KVM: VMX: Initialize TDX during KVM module load Before KVM can use TDX to create and run TDX guests, TDX needs to be initialized from two perspectives: 1) TDX module must be initialized properly to a working state; 2) A per-cpu TDX initialization, a.k.a the TDH.SYS.LP.INIT SEAMCALL must be done on any logical cpu before it can run any other TDX SEAMCALLs. The TDX host core-kernel provides two functions to do the above two respectively: tdx_enable() and tdx_cpu_enable(). There are two options in terms of when to initialize TDX: initialize TDX at KVM module loading time, or when creating the first TDX guest. Choose to initialize TDX during KVM module loading time: Initializing TDX module is both memory and CPU time consuming: 1) the kernel needs to allocate a non-trivial size(~1/256) of system memory as metadata used by TDX module to track each TDX-usable memory page's status; 2) the TDX module needs to initialize this metadata, one entry for each TDX-usable memory page. Also, the kernel uses alloc_contig_pages() to allocate those metadata chunks, because they are large and need to be physically contiguous. alloc_contig_pages() can fail. If initializing TDX when creating the first TDX guest, then there's chance that KVM won't be able to run any TDX guests albeit KVM _declares_ to be able to support TDX. This isn't good for the user. On the other hand, initializing TDX at KVM module loading time can make sure KVM is providing a consistent view of whether KVM can support TDX to the user. Always only try to initialize TDX after VMX has been initialized. TDX is based on VMX, and if VMX fails to initialize then TDX is likely to be broken anyway. Also, in practice, supporting TDX will require part of VMX and common x86 infrastructure in working order, so TDX cannot be enabled alone w/o VMX support. There are two cases that can result in failure to initialize TDX: 1) TDX cannot be supported (e.g., because of TDX is not supported or enabled by hardware, or module is not loaded, or missing some dependency in KVM's configuration); 2) Any unexpected error during TDX bring-up. For the first case only mark TDX is disabled but still allow KVM module to be loaded. For the second case just fail to load the KVM module so that the user can be aware. Because TDX costs additional memory, don't enable TDX by default. Add a new module parameter 'enable_tdx' to allow the user to opt-in. Note, the name tdx_init() has already been taken by the early boot code. Use tdx_bringup() for initializing TDX (and tdx_cleanup() since KVM doesn't actually teardown TDX). They don't match vt_init()/vt_exit(), vmx_init()/vmx_exit() etc but it's not end of the world. Also, once initialized, the TDX module cannot be disabled and enabled again w/o the TDX module runtime update, which isn't supported by the kernel. After TDX is enabled, nothing needs to be done when KVM disables hardware virtualization, e.g., when offlining CPU, or during suspend/resume. TDX host core-kernel code internally tracks TDX status and can handle "multiple enabling" scenario. Similar to KVM_AMD_SEV, add a new KVM_INTEL_TDX Kconfig to guide KVM TDX code. Make it depend on INTEL_TDX_HOST but not replace INTEL_TDX_HOST because in the longer term there's a use case that requires making SEAMCALLs w/o KVM as mentioned by Dan [1]. Link: https://lore.kernel.org/6723fc2070a96_60c3294dc@dwillia2-mobl3.amr.corp.intel.com.notmuch/ [1] Signed-off-by: Kai Huang Message-ID: <162f9dee05c729203b9ad6688db1ca2960b4b502.1731664295.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1e75fa114f34..3bfe3140f444 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2284,6 +2284,7 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) } #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +extern bool enable_virt_at_load; extern bool kvm_rebooting; #endif -- cgit v1.2.3 From 7c035bea94074b19ed560a4f23a76c5a6c8e594f Mon Sep 17 00:00:00 2001 From: Zhiming Hu Date: Wed, 19 Feb 2025 09:02:51 -0500 Subject: KVM: TDX: Register TDX host key IDs to cgroup misc controller TDX host key IDs (HKID) are limit resources in a machine, and the misc cgroup lets the machine owner track their usage and limits the possibility of abusing them outside the owner's control. The cgroup v2 miscellaneous subsystem was introduced to control the resource of AMD SEV & SEV-ES ASIDs. Likewise introduce HKIDs as a misc resource. Signed-off-by: Zhiming Hu Signed-off-by: Isaku Yamahata Signed-off-by: Paolo Bonzini --- include/linux/misc_cgroup.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index 49eef10c8e59..8c0e4f4d71be 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -17,6 +17,10 @@ enum misc_res_type { MISC_CG_RES_SEV, /** @MISC_CG_RES_SEV_ES: AMD SEV-ES ASIDs resource */ MISC_CG_RES_SEV_ES, +#endif +#ifdef CONFIG_INTEL_TDX_HOST + /* Intel TDX HKIDs resource */ + MISC_CG_RES_TDX, #endif /** @MISC_CG_RES_TYPES: count of enum misc_res_type constants */ MISC_CG_RES_TYPES -- cgit v1.2.3 From c4a92f12cf35b83ce81757f6e5e8eb6223b87388 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Mon, 13 Jan 2025 11:08:41 +0800 Subject: KVM: Add parameter "kvm" to kvm_cpu_dirty_log_size() and its callers Add a parameter "kvm" to kvm_cpu_dirty_log_size() and down to its callers: kvm_dirty_ring_get_rsvd_entries(), kvm_dirty_ring_alloc(). This is a preparation to make cpu_dirty_log_size a per-VM value rather than a system-wide value. No function changes expected. Signed-off-by: Yan Zhao Signed-off-by: Paolo Bonzini --- include/linux/kvm_dirty_ring.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h index 4862c98d80d3..da4d9b5f58f1 100644 --- a/include/linux/kvm_dirty_ring.h +++ b/include/linux/kvm_dirty_ring.h @@ -32,7 +32,7 @@ struct kvm_dirty_ring { * If CONFIG_HAVE_HVM_DIRTY_RING not defined, kvm_dirty_ring.o should * not be included as well, so define these nop functions for the arch. */ -static inline u32 kvm_dirty_ring_get_rsvd_entries(void) +static inline u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm) { return 0; } @@ -42,7 +42,7 @@ static inline bool kvm_use_dirty_bitmap(struct kvm *kvm) return true; } -static inline int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, +static inline int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, int index, u32 size) { return 0; @@ -71,11 +71,12 @@ static inline void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) #else /* CONFIG_HAVE_KVM_DIRTY_RING */ -int kvm_cpu_dirty_log_size(void); +int kvm_cpu_dirty_log_size(struct kvm *kvm); bool kvm_use_dirty_bitmap(struct kvm *kvm); bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm); -u32 kvm_dirty_ring_get_rsvd_entries(void); -int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size); +u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm); +int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, + int index, u32 size); /* * called with kvm->slots_lock held, returns the number of -- cgit v1.2.3 From 44428e4936022a7a31743017849b167e64f33a32 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Sat, 22 Feb 2025 09:42:18 +0800 Subject: KVM: x86: Move pv_unhalted check out of kvm_vcpu_has_events() Move pv_unhalted check out of kvm_vcpu_has_events(), check pv_unhalted explicitly when handling PV unhalt and expose kvm_vcpu_has_events(). kvm_vcpu_has_events() returns true if pv_unhalted is set, and pv_unhalted is only cleared on transitions to KVM_MP_STATE_RUNNABLE. If the guest initiates a spurious wakeup, pv_unhalted could be left set in perpetuity. Currently, this is not problematic because kvm_vcpu_has_events() is only called when handling PV unhalt. However, if kvm_vcpu_has_events() is used for other purposes in the future, it could return the unexpected results. Export kvm_vcpu_has_events() for its usage in broader contexts. Suggested-by: Sean Christopherson Signed-off-by: Binbin Wu Message-ID: <20250222014225.897298-3-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3bfe3140f444..ed1968f6f841 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1609,6 +1609,7 @@ void kvm_arch_disable_virtualization(void); int kvm_arch_enable_virtualization_cpu(void); void kvm_arch_disable_virtualization_cpu(void); #endif +bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 0b28b7080ef5a323c3afa3860c3d45d627629830 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Tue, 4 Feb 2025 14:14:12 +0100 Subject: platform: cznic: Add keyctl helpers for Turris platform MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some Turris devices support signing messages with a per-device unique asymmetric key that was created on the device at manufacture time. Add helper module that helps to expose this ability via the keyctl() syscall. A device-specific driver can register a signing key by calling devm_turris_signing_key_create(). Both the `.turris-signing-keys` keyring and the signing key are created with only the VIEW, READ and SEARCH permissions for userspace - it is impossible to link / unlink / move them, set their attributes, or unlink the keyring from userspace. Signed-off-by: Marek Behún Signed-off-by: Arnd Bergmann --- include/linux/turris-signing-key.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 include/linux/turris-signing-key.h (limited to 'include/linux') diff --git a/include/linux/turris-signing-key.h b/include/linux/turris-signing-key.h new file mode 100644 index 000000000000..032ca8cbf636 --- /dev/null +++ b/include/linux/turris-signing-key.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * 2025 by Marek Behún + */ + +#ifndef __TURRIS_SIGNING_KEY_H +#define __TURRIS_SIGNING_KEY_H + +#include +#include + +struct device; + +struct turris_signing_key_subtype { + u16 key_size; + u8 data_size; + u8 sig_size; + u8 public_key_size; + const char *hash_algo; + const void *(*get_public_key)(const struct key *key); + int (*sign)(const struct key *key, const void *msg, void *signature); +}; + +static inline struct device *turris_signing_key_get_dev(const struct key *key) +{ + return key->payload.data[1]; +} + +int +devm_turris_signing_key_create(struct device *dev, const struct turris_signing_key_subtype *subtype, + const char *desc); + +#endif /* __TURRIS_SIGNING_KEY_H */ -- cgit v1.2.3 From 78a0323506f01e8017a5826cd7e91951c13184fa Mon Sep 17 00:00:00 2001 From: Sohil Mehta Date: Thu, 27 Mar 2025 23:46:22 +0000 Subject: x86/nmi: Consolidate NMI panic variables Commit: c305a4e98378 ("x86: Move sysctls into arch/x86") recently moved the sysctl handling of panic_on_unrecovered_nmi and panic_on_io_nmi to x86-specific code. These variables no longer need to be declared in the generic header file. Relocate the variable definitions and declarations closer to where they are used. This makes all the NMI panic options consistent and easier to track. [ mingo: Fixed up the SHA1 of the commit reference. ] Signed-off-by: Sohil Mehta Signed-off-by: Ingo Molnar Reviewed-by: Kai Huang Acked-by: Peter Zijlstra (Intel) Reviewed-by: Nikolay Borisov Cc: Joel Granados Link: https://lore.kernel.org/r/20250327234629.3953536-3-sohil.mehta@intel.com --- include/linux/panic.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/panic.h b/include/linux/panic.h index 2494d51707ef..4adc65766935 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -20,8 +20,6 @@ extern bool panic_triggering_all_cpu_backtrace; extern int panic_timeout; extern unsigned long panic_print; extern int panic_on_oops; -extern int panic_on_unrecovered_nmi; -extern int panic_on_io_nmi; extern int panic_on_warn; extern unsigned long panic_on_taint; -- cgit v1.2.3 From 642989287350fa4964c37df0f3769094072421c3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 24 Mar 2025 17:59:59 +0100 Subject: firmware: turris-mox-rwtm: fix building without CONFIG_KEYS "struct key" is defined conditionally, so the code referencing it must be made conditional as well: In file included from drivers/firmware/turris-mox-rwtm.c:29: include/linux/turris-signing-key.h: In function 'turris_signing_key_get_dev': include/linux/turris-signing-key.h:26:19: error: invalid use of undefined type 'const struct key' 26 | return key->payload.data[1]; | ^~ Signed-off-by: Arnd Bergmann --- include/linux/turris-signing-key.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/turris-signing-key.h b/include/linux/turris-signing-key.h index 032ca8cbf636..8a435b73c3a9 100644 --- a/include/linux/turris-signing-key.h +++ b/include/linux/turris-signing-key.h @@ -11,6 +11,7 @@ struct device; +#ifdef CONFIG_KEYS struct turris_signing_key_subtype { u16 key_size; u8 data_size; @@ -29,5 +30,6 @@ static inline struct device *turris_signing_key_get_dev(const struct key *key) int devm_turris_signing_key_create(struct device *dev, const struct turris_signing_key_subtype *subtype, const char *desc); +#endif #endif /* __TURRIS_SIGNING_KEY_H */ -- cgit v1.2.3 From f6e9a26e2d488c743757d66898ae91c53ffbe528 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Thu, 3 Apr 2025 18:10:46 -0700 Subject: cgroup: move rstat base stat objects into their own struct This non-functional change serves as preparation for moving to subsystem-based rstat trees. The base stats are not an actual subsystem, but in future commits they will have exclusive rstat trees just as other subsystems will. Moving the base stat objects into a new struct allows the cgroup_rstat_cpu struct to become more compact since it now only contains the minimum amount of pointers needed for rstat participation. Subsystems will (in future commits) make use of the compact cgroup_rstat_cpu struct while avoiding the memory overhead of the base stat objects which they will not use. An instance of the new struct cgroup_rstat_base_cpu was placed on the cgroup struct so it can retain ownership of these base stats common to all cgroups. A helper function was added for looking up the cpu-specific base stats of a given cgroup. Finally, initialization and variable names were adjusted where applicable. Signed-off-by: JP Kobryn Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5bc8f55c8cca..1bf2e8db6dac 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -344,10 +344,29 @@ struct cgroup_base_stat { * frequency decreases the cost of each read. * * This struct hosts both the fields which implement the above - - * updated_children and updated_next - and the fields which track basic - * resource statistics on top of it - bsync, bstat and last_bstat. + * updated_children and updated_next. */ struct cgroup_rstat_cpu { + /* + * Child cgroups with stat updates on this cpu since the last read + * are linked on the parent's ->updated_children through + * ->updated_next. + * + * In addition to being more compact, singly-linked list pointing + * to the cgroup makes it unnecessary for each per-cpu struct to + * point back to the associated cgroup. + * + * Protected by per-cpu cgroup_rstat_cpu_lock. + */ + struct cgroup *updated_children; /* terminated by self cgroup */ + struct cgroup *updated_next; /* NULL iff not on the list */ +}; + +/* + * This struct hosts the fields which track basic resource statistics on + * top of it - bsync, bstat and last_bstat. + */ +struct cgroup_rstat_base_cpu { /* * ->bsync protects ->bstat. These are the only fields which get * updated in the hot path. @@ -374,20 +393,6 @@ struct cgroup_rstat_cpu { * deltas to propagate to the per-cpu subtree_bstat. */ struct cgroup_base_stat last_subtree_bstat; - - /* - * Child cgroups with stat updates on this cpu since the last read - * are linked on the parent's ->updated_children through - * ->updated_next. - * - * In addition to being more compact, singly-linked list pointing - * to the cgroup makes it unnecessary for each per-cpu struct to - * point back to the associated cgroup. - * - * Protected by per-cpu cgroup_rstat_cpu_lock. - */ - struct cgroup *updated_children; /* terminated by self cgroup */ - struct cgroup *updated_next; /* NULL iff not on the list */ }; struct cgroup_freezer_state { @@ -518,6 +523,7 @@ struct cgroup { /* per-cpu recursive resource statistics */ struct cgroup_rstat_cpu __percpu *rstat_cpu; + struct cgroup_rstat_base_cpu __percpu *rstat_base_cpu; struct list_head rstat_css_list; /* -- cgit v1.2.3 From 845a7245801142bfff411bc84afa8cdbc789562f Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Thu, 3 Apr 2025 18:10:47 -0700 Subject: cgroup: add helper for checking when css is cgroup::self The cgroup struct has a css field called "self". The main difference between this css and the others found in the cgroup::subsys array is that cgroup::self has a NULL subsystem pointer. There are several places where checks are performed to determine whether the css in question is cgroup::self or not. Instead of accessing css->ss directly, introduce a helper function that shows the intent and use where applicable. Signed-off-by: JP Kobryn Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e7da3c3b098b..65d95bb2199f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -347,6 +347,11 @@ static inline bool css_is_dying(struct cgroup_subsys_state *css) return css->flags & CSS_DYING; } +static inline bool css_is_cgroup(struct cgroup_subsys_state *css) +{ + return css->ss == NULL; +} + static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); -- cgit v1.2.3 From a97915559f5c5ff1972d678b94fd460c72a3b5f2 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Thu, 3 Apr 2025 18:10:48 -0700 Subject: cgroup: change rstat function signatures from cgroup-based to css-based This non-functional change serves as preparation for moving to subsystem-based rstat trees. To simplify future commits, change the signatures of existing cgroup-based rstat functions to become css-based and rename them to reflect that. Though the signatures have changed, the implementations have not. Within these functions use the css->cgroup pointer to obtain the associated cgroup and allow code to function the same just as it did before this patch. At applicable call sites, pass the subsystem-specific css pointer as an argument or pass a pointer to cgroup::self if not in subsystem context. Note that cgroup_rstat_updated_list() and cgroup_rstat_push_children() are not altered yet since there would be a larger amount of css to cgroup conversions which may overcomplicate the code at this intermediate phase. Signed-off-by: JP Kobryn Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 2 +- include/linux/cgroup.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1bf2e8db6dac..e58bfb880111 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -536,7 +536,7 @@ struct cgroup { /* * A singly-linked list of cgroup structures to be rstat flushed. * This is a scratch field to be used exclusively by - * cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock. + * css_rstat_flush_locked() and protected by cgroup_rstat_lock. */ struct cgroup *rstat_flush_next; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 65d95bb2199f..1f5b0a4a3356 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -693,8 +693,8 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) /* * cgroup scalable recursive statistics. */ -void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); -void cgroup_rstat_flush(struct cgroup *cgrp); +void css_rstat_updated(struct cgroup_subsys_state *css, int cpu); +void css_rstat_flush(struct cgroup_subsys_state *css); /* * Basic resource stats. -- cgit v1.2.3 From dd8a9807fa03666bff52cb28472fb227eaac36c9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 31 Mar 2025 13:35:59 +0300 Subject: spi: Group CS related fields in struct spi_device The CS related fields are sparse in the struct spi_device. Group them. While at it, fix the comment style of cs_index_mask. Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20250331103609.4160281-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 0ba5e49bace4..e10f0ebb8250 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -136,13 +136,6 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * @max_speed_hz: Maximum clock rate to be used with this chip * (on this board); may be changed by the device's driver. * The spi_transfer.speed_hz can override this for each transfer. - * @chip_select: Array of physical chipselect, spi->chipselect[i] gives - * the corresponding physical CS for logical CS i. - * @mode: The spi mode defines how data is clocked out and in. - * This may be changed by the device's driver. - * The "active low" default for chipselect mode can be overridden - * (by specifying SPI_CS_HIGH) as can the "MSB first" default for - * each word in a transfer (by specifying SPI_LSB_FIRST). * @bits_per_word: Data transfers involve one or more words; word sizes * like eight or 12 bits are common. In-memory wordsizes are * powers of two bytes (e.g. 20 bit samples use 32 bits). @@ -150,6 +143,11 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * default (0) indicating protocol words are eight bit bytes. * The spi_transfer.bits_per_word can override this for each transfer. * @rt: Make the pump thread real time priority. + * @mode: The spi mode defines how data is clocked out and in. + * This may be changed by the device's driver. + * The "active low" default for chipselect mode can be overridden + * (by specifying SPI_CS_HIGH) as can the "MSB first" default for + * each word in a transfer (by specifying SPI_LSB_FIRST). * @irq: Negative, or the number passed to request_irq() to receive * interrupts from this device. * @controller_state: Controller's runtime state @@ -162,8 +160,7 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * the device will bind to the named driver and only the named driver. * Do not set directly, because core frees it; use driver_set_override() to * set or clear it. - * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines - * (optional, NULL when not using a GPIO line) + * @pcpu_statistics: statistics for the spi_device * @word_delay: delay to be inserted between consecutive * words of a transfer * @cs_setup: delay to be introduced by the controller after CS is asserted @@ -171,8 +168,11 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * @cs_inactive: delay to be introduced by the controller after CS is * deasserted. If @cs_change_delay is used from @spi_transfer, then the * two delays will be added up. - * @pcpu_statistics: statistics for the spi_device + * @chip_select: Array of physical chipselect, spi->chipselect[i] gives + * the corresponding physical CS for logical CS i. * @cs_index_mask: Bit mask of the active chipselect(s) in the chipselect array + * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines + * (optional, NULL when not using a GPIO line) * * A @spi_device is used to interchange data between an SPI target device * (usually a discrete chip) and CPU memory. @@ -187,7 +187,6 @@ struct spi_device { struct device dev; struct spi_controller *controller; u32 max_speed_hz; - u8 chip_select[SPI_CS_CNT_MAX]; u8 bits_per_word; bool rt; #define SPI_NO_TX BIT(31) /* No transmit wire */ @@ -218,23 +217,29 @@ struct spi_device { void *controller_data; char modalias[SPI_NAME_SIZE]; const char *driver_override; - struct gpio_desc *cs_gpiod[SPI_CS_CNT_MAX]; /* Chip select gpio desc */ + + /* The statistics */ + struct spi_statistics __percpu *pcpu_statistics; + struct spi_delay word_delay; /* Inter-word delay */ + /* CS delays */ struct spi_delay cs_setup; struct spi_delay cs_hold; struct spi_delay cs_inactive; - /* The statistics */ - struct spi_statistics __percpu *pcpu_statistics; + u8 chip_select[SPI_CS_CNT_MAX]; - /* Bit mask of the chipselect(s) that the driver need to use from - * the chipselect array.When the controller is capable to handle + /* + * Bit mask of the chipselect(s) that the driver need to use from + * the chipselect array. When the controller is capable to handle * multiple chip selects & memories are connected in parallel * then more than one bit need to be set in cs_index_mask. */ u32 cs_index_mask : SPI_CS_CNT_MAX; + struct gpio_desc *cs_gpiod[SPI_CS_CNT_MAX]; /* Chip select gpio desc */ + /* * Likely need more hooks for more protocol options affecting how * the controller talks to each chip, like: -- cgit v1.2.3 From f7b86e0e75bc234751cb7a82d888083a57ef28b2 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 24 Mar 2025 21:15:17 +0000 Subject: crypto: ccp - Add new SEV/SNP platform shutdown API Add new API interface to do SEV/SNP platform shutdown when KVM module is unloaded. Reviewed-by: Dionna Glaze Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Signed-off-by: Herbert Xu --- include/linux/psp-sev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index f3cad182d4ef..0b3a36bdaa90 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -954,6 +954,7 @@ int sev_do_cmd(int cmd, void *data, int *psp_ret); void *psp_copy_user_blob(u64 uaddr, u32 len); void *snp_alloc_firmware_page(gfp_t mask); void snp_free_firmware_page(void *addr); +void sev_platform_shutdown(void); #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ @@ -988,6 +989,8 @@ static inline void *snp_alloc_firmware_page(gfp_t mask) static inline void snp_free_firmware_page(void *addr) { } +static inline void sev_platform_shutdown(void) { } + #endif /* CONFIG_CRYPTO_DEV_SP_PSP */ #endif /* __PSP_SEV_H__ */ -- cgit v1.2.3 From 1be1cd03a93339f14c8f4fe300bca321fddc6478 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 3 Apr 2025 18:59:14 +0300 Subject: gpiolib: acpi: Reduce memory footprint for struct acpi_gpio_params The line_index member in the struct acpi_gpio_params replicates what is covered in the ACPI GpioIo() or GpioInt() resource. The value there is limited to 16-bit one, so we don't really need to have a full 32-bit storage for it. Together with followed boolean the structure will be smaller. add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-3 (-3) Function old new delta acpi_gpio_property_lookup 417 414 -3 Total: Before=15361, After=15358, chg -0.02% `pahole` difference before and after: - /* size: 12, cachelines: 1, members: 3 */ - /* padding: 3 */ + /* size: 8, cachelines: 1, members: 3 */ + /* padding: 1 */ Acked-by: Mika Westerberg Link: https://lore.kernel.org/r/20250403160034.2680485-4-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko --- include/linux/gpio/consumer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 45b651c05b9c..899179972bec 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -587,7 +587,7 @@ struct gpio_desc *devm_fwnode_gpiod_get(struct device *dev, struct acpi_gpio_params { unsigned int crs_entry_index; - unsigned int line_index; + unsigned short line_index; bool active_low; }; -- cgit v1.2.3 From 5741909697a31cfb08e45d56b4211959fb791487 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 19 Mar 2025 14:01:32 +1100 Subject: VFS: improve interface for lookup_one functions The family of functions: lookup_one() lookup_one_unlocked() lookup_one_positive_unlocked() appear designed to be used by external clients of the filesystem rather than by filesystems acting on themselves as the lookup_one_len family are used. They are used by: btrfs/ioctl - which is a user-space interface rather than an internal activity exportfs - i.e. from nfsd or the open_by_handle_at interface overlayfs - at access the underlying filesystems smb/server - for file service They should be used by nfsd (more than just the exportfs path) and cachefs but aren't. It would help if the documentation didn't claim they should "not be called by generic code". Also the path component name is passed as "name" and "len" which are (confusingly?) separate by the "base". In some cases the len in simply "strlen" and so passing a qstr using QSTR() would make the calling clearer. Other callers do pass separate name and len which are stored in a struct. Sometimes these are already stored in a qstr, other times it easily could be. So this patch changes these three functions to receive a 'struct qstr *', and improves the documentation. QSTR_LEN() is added to make it easy to pass a QSTR containing a known len. [brauner@kernel.org: take a struct qstr pointer] Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/20250319031545.2999807-2-neil@brown.name Signed-off-by: Christian Brauner --- include/linux/dcache.h | 3 ++- include/linux/namei.h | 9 ++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 8d1395f945bf..e974e63bcdbc 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -57,7 +57,8 @@ struct qstr { }; #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } -#define QSTR(n) (struct qstr)QSTR_INIT(n, strlen(n)) +#define QSTR_LEN(n,l) (struct qstr)QSTR_INIT(n,l) +#define QSTR(n) QSTR_LEN(n, strlen(n)) extern const struct qstr empty_name; extern const struct qstr slash_name; diff --git a/include/linux/namei.h b/include/linux/namei.h index e3042176cdf4..ba02304a2a8a 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -73,13 +73,12 @@ extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int); extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int); -struct dentry *lookup_one(struct mnt_idmap *, const char *, struct dentry *, int); +struct dentry *lookup_one(struct mnt_idmap *, struct qstr *, struct dentry *); struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, - const char *name, struct dentry *base, - int len); + struct qstr *name, struct dentry *base); struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, - const char *name, - struct dentry *base, int len); + struct qstr *name, + struct dentry *base); extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); -- cgit v1.2.3 From 0a02e1f4a54ace747304687ced3b76d159e58914 Mon Sep 17 00:00:00 2001 From: Yixun Lan Date: Wed, 26 Mar 2025 06:06:19 +0800 Subject: irqdomain: Support three-cell scheme interrupts Add new function *_twothreecell() to extend support to parse three-cell interrupts which encoded as , the translate function will retrieve irq number and flag from last two cells. This API will be used in gpio irq driver which need to work with two or three cells cases. Signed-off-by: Yixun Lan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250326-04-gpio-irq-threecell-v3-1-aab006ab0e00@gentoo.org --- include/linux/irqdomain.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index bb7111105296..df7e9278c8ac 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -571,16 +571,16 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type); - -int irq_domain_translate_twocell(struct irq_domain *d, - struct irq_fwspec *fwspec, - unsigned long *out_hwirq, - unsigned int *out_type); - -int irq_domain_translate_onecell(struct irq_domain *d, - struct irq_fwspec *fwspec, - unsigned long *out_hwirq, - unsigned int *out_type); +int irq_domain_xlate_twothreecell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type); + +int irq_domain_translate_onecell(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *out_hwirq, unsigned int *out_type); +int irq_domain_translate_twocell(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *out_hwirq, unsigned int *out_type); +int irq_domain_translate_twothreecell(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *out_hwirq, unsigned int *out_type); /* IPI functions */ int irq_reserve_ipi(struct irq_domain *domain, const struct cpumask *dest); -- cgit v1.2.3 From 7b73c12c6ebf006ad496f0e38a605d92dfe05157 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 15:59:59 +0100 Subject: shmem: Add shmem_writeout() This will be the replacement for shmem_writepage(). Signed-off-by: "Matthew Wilcox (Oracle)" Link: https://lore.kernel.org/r/20250402150005.2309458-6-willy@infradead.org Reviewed-by: Baolin Wang Signed-off-by: Christian Brauner --- include/linux/shmem_fs.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0b273a7b9f01..5f03a39a26f7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -104,10 +104,11 @@ static inline bool shmem_mapping(struct address_space *mapping) return false; } #endif /* CONFIG_SHMEM */ -extern void shmem_unlock_mapping(struct address_space *mapping); -extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, +void shmem_unlock_mapping(struct address_space *mapping); +struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); -extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); +int shmem_writeout(struct folio *folio, struct writeback_control *wbc); +void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE -- cgit v1.2.3 From 6b0dfabb35550cb7b0808585dea6c24971d685d3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 16:00:03 +0100 Subject: fs: Remove aops->writepage All callers and implementations are now removed, so remove the operation and update the documentation to match. Signed-off-by: "Matthew Wilcox (Oracle)" Link: https://lore.kernel.org/r/20250402150005.2309458-10-willy@infradead.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 016b0fe1536e..f7beefb917a8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -433,7 +433,6 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb) } struct address_space_operations { - int (*writepage)(struct page *page, struct writeback_control *wbc); int (*read_folio)(struct file *, struct folio *); /* Write back some dirty pages from this mapping. */ -- cgit v1.2.3 From 559b3bbfa978ce3b23dc9c52d09a0eddca52c439 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Thu, 27 Mar 2025 10:06:10 -0400 Subject: locking/percpu-rwsem: add freezable alternative to down_read Percpu-rwsems are used for superblock locking. However, we know the read percpu-rwsem we take for sb_start_write() on a frozen filesystem needs not to inhibit system from suspending or hibernating. That means it needs to wait with TASK_UNINTERRUPTIBLE | TASK_FREEZABLE. Introduce a new percpu_down_read_freezable() that allows us to control whether TASK_FREEZABLE is added to the wait flags. Signed-off-by: James Bottomley Link: https://lore.kernel.org/r/20250327140613.25178-2-James.Bottomley@HansenPartnership.com Signed-off-by: Christian Brauner --- include/linux/percpu-rwsem.h | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index af7d75ede619..288f5235649a 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -43,9 +43,10 @@ is_static struct percpu_rw_semaphore name = { \ #define DEFINE_STATIC_PERCPU_RWSEM(name) \ __DEFINE_PERCPU_RWSEM(name, static) -extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool); +extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool, bool); -static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +static inline void percpu_down_read_internal(struct percpu_rw_semaphore *sem, + bool freezable) { might_sleep(); @@ -63,7 +64,7 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem) if (likely(rcu_sync_is_idle(&sem->rss))) this_cpu_inc(*sem->read_count); else - __percpu_down_read(sem, false); /* Unconditional memory barrier */ + __percpu_down_read(sem, false, freezable); /* Unconditional memory barrier */ /* * The preempt_enable() prevents the compiler from * bleeding the critical section out. @@ -71,6 +72,17 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem) preempt_enable(); } +static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +{ + percpu_down_read_internal(sem, false); +} + +static inline void percpu_down_read_freezable(struct percpu_rw_semaphore *sem, + bool freeze) +{ + percpu_down_read_internal(sem, freeze); +} + static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { bool ret = true; @@ -82,7 +94,7 @@ static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem) if (likely(rcu_sync_is_idle(&sem->rss))) this_cpu_inc(*sem->read_count); else - ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */ + ret = __percpu_down_read(sem, true, false); /* Unconditional memory barrier */ preempt_enable(); /* * The barrier() from preempt_enable() prevents the compiler from -- cgit v1.2.3 From f73bae83675b860cae9f58dba233b6be92e750ca Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Thu, 27 Mar 2025 10:06:11 -0400 Subject: fs: allow all writers to be frozen During freeze/thaw we need to be able to freeze all writers during suspend/hibernate. Otherwise tasks such as systemd-journald that mmap a file and write to it will not be frozen after we've already frozen the filesystem. This has some risk of not being able to freeze processes in case a process has acquired SB_FREEZE_PAGEFAULT under mmap_sem or SB_FREEZE_INTERNAL under some other filesytem specific lock. If the filesystem is frozen, a task can block on the frozen filesystem with e.g., mmap_sem held. If some other task then blocks on grabbing that mmap_sem, hibernation ill fail because it is unable to hibernate a task holding mmap_sem. This could be fixed by making a range of filesystem related locks use freezable sleeping. That's impractical and not warranted just for suspend/hibernate. Assume that this is an infrequent problem and we've given userspace a way to skip filesystem freezing through a sysfs file. Link: https://lore.kernel.org/r/20250402-work-freeze-v2-2-6719a97b52ac@kernel.org Link: https://lore.kernel.org/r/20250327140613.25178-3-James.Bottomley@HansenPartnership.com [brauner: make all freeze levels set TASK_FREEZABLE and rewrite commit message] Reviewed-by: Jan Kara Signed-off-by: James Bottomley Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 016b0fe1536e..35b81f497904 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1780,7 +1780,7 @@ static inline void __sb_end_write(struct super_block *sb, int level) static inline void __sb_start_write(struct super_block *sb, int level) { - percpu_down_read(sb->s_writers.rw_sem + level - 1); + percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true); } static inline bool __sb_start_write_trylock(struct super_block *sb, int level) -- cgit v1.2.3 From 2992476528aeecbaee17ba0a6396a817481205a3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 29 Mar 2025 09:42:17 +0100 Subject: super: use a common iterator (Part 1) Use a common iterator for all callbacks. Link: https://lore.kernel.org/r/20250329-work-freeze-v2-4-a47af37ecc3d@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 35b81f497904..5e007bffd00e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3515,7 +3515,11 @@ extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); -extern void iterate_supers(void (*)(struct super_block *, void *), void *); +void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, bool excl); +static inline void iterate_supers(void (*f)(struct super_block *, void *), void *arg) +{ + __iterate_supers(f, arg, false); +} extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); -- cgit v1.2.3 From b47e42d10e8c20525febccbd6e0dc8528861aea4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 29 Mar 2025 09:42:18 +0100 Subject: super: use common iterator (Part 2) Use a common iterator for all callbacks. We could go for something even more elaborate (advance step-by-step similar to iov_iter) but I really don't think this is warranted. Link: https://lore.kernel.org/r/20250329-work-freeze-v2-5-a47af37ecc3d@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 5e007bffd00e..96f8c8a794ea 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3515,11 +3515,7 @@ extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); -void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, bool excl); -static inline void iterate_supers(void (*f)(struct super_block *, void *), void *arg) -{ - __iterate_supers(f, arg, false); -} +extern void iterate_supers(void (*f)(struct super_block *, void *), void *arg); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); -- cgit v1.2.3 From 06f2f68a670aae28b825065439301831e74da880 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Mar 2025 15:31:15 +0100 Subject: genirq/generic-chip: Make locking unconditional The SMP conditional wrappers around raw_spin_[un]lock() have no real value. On !SMP kernels the lock operations are NOOPs except for a preempt_disable/enable() pair on PREEMPT enabled kernels, which are not really worth to optimize for. Aside of that this evades lockdep on !SMP kernels. Remove the !SMP stubs and make it unconditional. No functional change. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250313142524.011345765@linutronix.de --- include/linux/irq.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index dd5df1e2d032..500772943207 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1222,7 +1222,6 @@ static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) #define IRQ_MSK(n) (u32)((n) < 32 ? ((1 << (n)) - 1) : UINT_MAX) -#ifdef CONFIG_SMP static inline void irq_gc_lock(struct irq_chip_generic *gc) { raw_spin_lock(&gc->lock); @@ -1232,10 +1231,6 @@ static inline void irq_gc_unlock(struct irq_chip_generic *gc) { raw_spin_unlock(&gc->lock); } -#else -static inline void irq_gc_lock(struct irq_chip_generic *gc) { } -static inline void irq_gc_unlock(struct irq_chip_generic *gc) { } -#endif /* * The irqsave variants are for usage in non interrupt code. Do not use -- cgit v1.2.3 From 7ae844a6650c5c15ccfbf76ed767e7f2cc61ec1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Mar 2025 15:31:28 +0100 Subject: genirq/generic-chip: Remove unused lock wrappers All users are converted to lock guards. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250313142524.388478168@linutronix.de --- include/linux/irq.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 500772943207..d896d3a471ec 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1222,26 +1222,6 @@ static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) #define IRQ_MSK(n) (u32)((n) < 32 ? ((1 << (n)) - 1) : UINT_MAX) -static inline void irq_gc_lock(struct irq_chip_generic *gc) -{ - raw_spin_lock(&gc->lock); -} - -static inline void irq_gc_unlock(struct irq_chip_generic *gc) -{ - raw_spin_unlock(&gc->lock); -} - -/* - * The irqsave variants are for usage in non interrupt code. Do not use - * them in irq_chip callbacks. Use irq_gc_lock() instead. - */ -#define irq_gc_lock_irqsave(gc, flags) \ - raw_spin_lock_irqsave(&(gc)->lock, flags) - -#define irq_gc_unlock_irqrestore(gc, flags) \ - raw_spin_unlock_irqrestore(&(gc)->lock, flags) - static inline void irq_reg_writel(struct irq_chip_generic *gc, u32 val, int reg_offset) { -- cgit v1.2.3 From 1ce4c3aeef333be1e6290ec6d1f7891c2bfc7a1f Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 1 Apr 2025 11:37:16 +0200 Subject: firmware: sysfb: Move bpp-depth calculation into screen_info helper Move the calculation of the bits per pixels for screen_info into a helper function. This will make it available to other callers besides the firmware code. Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Link: https://lore.kernel.org/r/20250401094056.32904-14-tzimmermann@suse.de --- include/linux/screen_info.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/screen_info.h b/include/linux/screen_info.h index 6a4a3cec4638..ab3cffbb58b7 100644 --- a/include/linux/screen_info.h +++ b/include/linux/screen_info.h @@ -128,6 +128,8 @@ static inline unsigned int screen_info_video_type(const struct screen_info *si) ssize_t screen_info_resources(const struct screen_info *si, struct resource *r, size_t num); +u32 __screen_info_lfb_bits_per_pixel(const struct screen_info *si); + #if defined(CONFIG_PCI) void screen_info_apply_fixups(void); struct pci_dev *screen_info_pci_dev(const struct screen_info *si); -- cgit v1.2.3 From 814d270b31d27b6ea4f722b8ae2db9802fe332ff Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 1 Apr 2025 11:37:21 +0200 Subject: drm/sysfb: vesadrm: Add gamma correction Add palette support and export GAMMA properties via sysfs. User-space compositors can use this interface for programming gamma ramps or night mode. Vesadrm supports palette updates via VGA DAC registers or VESA palette calls. Up to 256 palette entries are available. Userspace always supplies gamma ramps of 256 entries. If the native color format does not match this because pixel component have less then 8 bits, vesadrm interpolates among the palette entries. The code uses CamelCase style in a few places to match the VESA manuals. v3: - fix coding style v2: - use CONFIG_X86_32 instead of __i386__ (checkpatch) - protect struct vesadrm.pmi with CONFIG_X86_32 Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Link: https://lore.kernel.org/r/20250401094056.32904-19-tzimmermann@suse.de --- include/linux/screen_info.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/screen_info.h b/include/linux/screen_info.h index ab3cffbb58b7..923d68e07679 100644 --- a/include/linux/screen_info.h +++ b/include/linux/screen_info.h @@ -126,6 +126,13 @@ static inline unsigned int screen_info_video_type(const struct screen_info *si) return VIDEO_TYPE_CGA; } +static inline u32 __screen_info_vesapm_info_base(const struct screen_info *si) +{ + if (si->vesapm_seg < 0xc000) + return 0; + return (si->vesapm_seg << 4) + si->vesapm_off; +} + ssize_t screen_info_resources(const struct screen_info *si, struct resource *r, size_t num); u32 __screen_info_lfb_bits_per_pixel(const struct screen_info *si); -- cgit v1.2.3 From af1c968d25c7c44cd2738349c479f8b610a3fc40 Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Mon, 7 Apr 2025 13:23:47 +0200 Subject: ASoC: Intel: avs: PTL-based platforms support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define handlers specific to ACE platforms, that Frisco Lake (FCL), a PantherLake (PTL)-based platform, is founded upon. Most operations are still inherited from their predecessors with the major difference being AudioDSP cores management - replaced by DSP-domain power management. Software has to ensure the DSP domain is both powered on and its power-gating disabled before it can be utilized for streaming. Reviewed-by: Amadeusz Sławiński Signed-off-by: Cezary Rojewski Acked-by: Liam Girdwood Link: https://patch.msgid.link/20250407112352.3720779-6-cezary.rojewski@intel.com Signed-off-by: Mark Brown --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 2e28182c3af0..981ed45cc45e 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -3070,6 +3070,7 @@ #define PCI_DEVICE_ID_INTEL_5100_21 0x65f5 #define PCI_DEVICE_ID_INTEL_5100_22 0x65f6 #define PCI_DEVICE_ID_INTEL_IOAT_SCNB 0x65ff +#define PCI_DEVICE_ID_INTEL_HDA_FCL 0x67a8 #define PCI_DEVICE_ID_INTEL_82371SB_0 0x7000 #define PCI_DEVICE_ID_INTEL_82371SB_1 0x7010 #define PCI_DEVICE_ID_INTEL_82371SB_2 0x7020 -- cgit v1.2.3 From 83b9ae77f06607d19f7d3dcc6008742051137b27 Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Fri, 4 Apr 2025 11:03:30 +0200 Subject: lib/string_helpers: Introduce parse_int_array() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Existing parse_inte_array_user() works with __user buffers only. Separate array parsing from __user bits so the functionality can be utilized with kernel buffers too. Cc: Andy Shevchenko Reviewed-by: Amadeusz Sławiński Signed-off-by: Cezary Rojewski Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20250404090337.3564117-2-cezary.rojewski@intel.com Signed-off-by: Mark Brown --- include/linux/string_helpers.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index e93fbb5b0c01..3fb88a1e9898 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -31,6 +31,7 @@ enum string_size_units { int string_get_size(u64 size, u64 blk_size, const enum string_size_units units, char *buf, int len); +int parse_int_array(const char *buf, size_t count, int **array); int parse_int_array_user(const char __user *from, size_t count, int **array); #define UNESCAPE_SPACE BIT(0) -- cgit v1.2.3 From 5f3e0b4a1f59ab616a09a45fddddefa6ef756229 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 7 Apr 2025 01:58:05 +0200 Subject: fs: predict not having to do anything in fdput() This matches the annotation in fdget(). Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/20250406235806.1637000-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/file.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/file.h b/include/linux/file.h index 302f11355b10..af1768d934a0 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -59,7 +59,7 @@ static inline struct fd CLONED_FD(struct file *f) static inline void fdput(struct fd fd) { - if (fd.word & FDPUT_FPUT) + if (unlikely(fd.word & FDPUT_FPUT)) fput(fd_file(fd)); } -- cgit v1.2.3 From fa6fe07d1536361a227d655e69ca270faf28fdbe Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 19 Mar 2025 14:01:35 +1100 Subject: VFS: rename lookup_one_len family to lookup_noperm and remove permission check The lookup_one_len family of functions is (now) only used internally by a filesystem on itself either - in a context where permission checking is irrelevant such as by a virtual filesystem populating itself, or xfs accessing its ORPHANAGE or dquota accessing the quota file; or - in a context where a permission check (MAY_EXEC on the parent) has just been performed such as a network filesystem finding in "silly-rename" file in the same directory. This is also the context after the _parentat() functions where currently lookup_one_qstr_excl() is used. So the permission check is pointless. The name "one_len" is unhelpful in understanding the purpose of these functions and should be changed. Most of the callers pass the len as "strlen()" so using a qstr and QSTR() can simplify the code. This patch renames these functions (include lookup_positive_unlocked() which is part of the family despite the name) to have a name based on "lookup_noperm". They are changed to receive a 'struct qstr' instead of separate name and len. In a few cases the use of QSTR() results in a new call to strlen(). try_lookup_noperm() takes a pointer to a qstr instead of the whole qstr. This is consistent with d_hash_and_lookup() (which is nearly identical) and useful for lookup_noperm_unlocked(). The new lookup_noperm_common() doesn't take a qstr yet. That will be tidied up in a subsequent patch. Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/20250319031545.2999807-5-neil@brown.name Signed-off-by: Christian Brauner --- include/linux/namei.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/namei.h b/include/linux/namei.h index ba02304a2a8a..cb6ce97782e0 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -69,10 +69,10 @@ int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); -extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int); -extern struct dentry *lookup_one_len(const char *, struct dentry *, int); -extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int); -extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int); +extern struct dentry *try_lookup_noperm(struct qstr *, struct dentry *); +extern struct dentry *lookup_noperm(struct qstr *, struct dentry *); +extern struct dentry *lookup_noperm_unlocked(struct qstr *, struct dentry *); +extern struct dentry *lookup_noperm_positive_unlocked(struct qstr *, struct dentry *); struct dentry *lookup_one(struct mnt_idmap *, struct qstr *, struct dentry *); struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base); -- cgit v1.2.3 From 06c567403ae5a0b56005c2d4a184c903f572c844 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 19 Mar 2025 14:01:36 +1100 Subject: Use try_lookup_noperm() instead of d_hash_and_lookup() outside of VFS try_lookup_noperm() and d_hash_and_lookup() are nearly identical. The former does some validation of the name where the latter doesn't. Outside of the VFS that validation is likely valuable, and having only one exported function for this task is certainly a good idea. So make d_hash_and_lookup() local to VFS files and change all other callers to try_lookup_noperm(). Note that the arguments are swapped. Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/20250319031545.2999807-6-neil@brown.name Signed-off-by: Christian Brauner --- include/linux/dcache.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index e974e63bcdbc..a324f82df562 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -288,7 +288,6 @@ extern void d_exchange(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); extern struct dentry *d_lookup(const struct dentry *, const struct qstr *); -extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); static inline unsigned d_count(const struct dentry *dentry) { -- cgit v1.2.3 From da916e96e2dedcb2d40de77a7def833d315b81a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Oct 2024 10:21:41 +0200 Subject: perf: Make perf_pmu_unregister() useable Previously it was only safe to call perf_pmu_unregister() if there were no active events of that pmu around -- which was impossible to guarantee since it races all sorts against perf_init_event(). Rework the whole thing by: - keeping track of all events for a given pmu - 'hiding' the pmu from perf_init_event() - waiting for the appropriate (s)rcu grace periods such that all prior references to the PMU will be completed - detaching all still existing events of that pmu (see first point) and moving them to a new REVOKED state. - actually freeing the pmu data. Where notably the new REVOKED state must inhibit all event actions from reaching code that wants to use event->pmu. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ravi Bangoria Link: https://lkml.kernel.org/r/20250307193723.525402029@infradead.org --- include/linux/perf_event.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0069ba6866a4..7f49a58b271d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -325,6 +325,9 @@ struct perf_output_handle; struct pmu { struct list_head entry; + spinlock_t events_lock; + struct list_head events; + struct module *module; struct device *dev; struct device *parent; @@ -622,9 +625,10 @@ struct perf_addr_filter_range { * enum perf_event_state - the states of an event: */ enum perf_event_state { - PERF_EVENT_STATE_DEAD = -4, - PERF_EVENT_STATE_EXIT = -3, - PERF_EVENT_STATE_ERROR = -2, + PERF_EVENT_STATE_DEAD = -5, + PERF_EVENT_STATE_REVOKED = -4, /* pmu gone, must not touch */ + PERF_EVENT_STATE_EXIT = -3, /* task died, still inherit */ + PERF_EVENT_STATE_ERROR = -2, /* scheduling error, can enable */ PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, PERF_EVENT_STATE_ACTIVE = 1, @@ -865,6 +869,7 @@ struct perf_event { void *security; #endif struct list_head sb_list; + struct list_head pmu_list; /* * Certain events gets forwarded to another pmu internally by over- @@ -1155,7 +1160,7 @@ extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); extern void perf_event_itrace_started(struct perf_event *event); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); -extern void perf_pmu_unregister(struct pmu *pmu); +extern int perf_pmu_unregister(struct pmu *pmu); extern void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task); @@ -1760,7 +1765,7 @@ static inline bool needs_branch_stack(struct perf_event *event) static inline bool has_aux(struct perf_event *event) { - return event->pmu->setup_aux; + return event->pmu && event->pmu->setup_aux; } static inline bool has_aux_action(struct perf_event *event) -- cgit v1.2.3 From 4dfe3232cc04325a09e96f6c7f9546ba6c0b132b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:13 -0700 Subject: perf/x86: Add dynamic constraint More and more features require a dynamic event constraint, e.g., branch counter logging, auto counter reload, Arch PEBS, etc. Add a generic flag, PMU_FL_DYN_CONSTRAINT, to indicate the case. It avoids keeping adding the individual flag in intel_cpuc_prepare(). Add a variable dyn_constraint in the struct hw_perf_event to track the dynamic constraint of the event. Apply it if it's updated. Apply the generic dynamic constraint for branch counter logging. Many features on and after V6 require dynamic constraint. So unconditionally set the flag for V6+. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-2-kan.liang@linux.intel.com --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7f49a58b271d..54dad174ed7a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -158,6 +158,7 @@ struct hw_perf_event { struct { /* hardware */ u64 config; u64 last_tag; + u64 dyn_constraint; unsigned long config_base; unsigned long event_base; int event_base_rdpmc; -- cgit v1.2.3 From c9449c8506a5df5052ef4d17867699517b10b55a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:15 -0700 Subject: perf: Extend the bit width of the arch-specific flag The auto counter reload feature requires an event flag to indicate an auto counter reload group, which can only be scheduled on specific counters that enumerated in CPUID. However, the hw_perf_event.flags has run out on X86. Two solutions were considered to address the issue. - Currently, 20 bits are reserved for the architecture-specific flags. Only the bit 31 is used for the generic flag. There is still plenty of space left. Reserve 8 more bits for the arch-specific flags. - Add a new X86 specific hw_perf_event.flags1 to support more flags. The former is implemented. Enough room is still left in the global generic flag. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-4-kan.liang@linux.intel.com --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 54dad174ed7a..5c547329cf02 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -144,7 +144,7 @@ struct hw_perf_event_extra { * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ -#define PERF_EVENT_FLAG_ARCH 0x000fffff +#define PERF_EVENT_FLAG_ARCH 0x0fffffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); -- cgit v1.2.3 From ec980e4facef8110f6fce27e5b6344660117f01f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:17 -0700 Subject: perf/x86/intel: Support auto counter reload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The relative rates among two or more events are useful for performance analysis, e.g., a high branch miss rate may indicate a performance issue. Usually, the samples with a relative rate that exceeds some threshold are more useful. However, the traditional sampling takes samples of events separately. To get the relative rates among two or more events, a high sample rate is required, which can bring high overhead. Many samples taken in the non-hotspot area are also dropped (useless) in the post-process. The auto counter reload (ACR) feature takes samples when the relative rate of two or more events exceeds some threshold, which provides the fine-grained information at a low cost. To support the feature, two sets of MSRs are introduced. For a given counter IA32_PMC_GPn_CTR/IA32_PMC_FXm_CTR, bit fields in the IA32_PMC_GPn_CFG_B/IA32_PMC_FXm_CFG_B MSR indicate which counter(s) can cause a reload of that counter. The reload value is stored in the IA32_PMC_GPn_CFG_C/IA32_PMC_FXm_CFG_C. The details can be found at Intel SDM (085), Volume 3, 21.9.11 Auto Counter Reload. In the hw_config(), an ACR event is specially configured, because the cause/reloadable counter mask has to be applied to the dyn_constraint. Besides the HW limit, e.g., not support perf metrics, PDist and etc, a SW limit is applied as well. ACR events in a group must be contiguous. It facilitates the later conversion from the event idx to the counter idx. Otherwise, the intel_pmu_acr_late_setup() has to traverse the whole event list again to find the "cause" event. Also, add a new flag PERF_X86_EVENT_ACR to indicate an ACR group, which is set to the group leader. The late setup() is also required for an ACR group. It's to convert the event idx to the counter idx, and saved it in hw.config1. The ACR configuration MSRs are only updated in the enable_event(). The disable_event() doesn't clear the ACR CFG register. Add acr_cfg_b/acr_cfg_c in the struct cpu_hw_events to cache the MSR values. It can avoid a MSR write if the value is not changed. Expose an acr_mask to the sysfs. The perf tool can utilize the new format to configure the relation of events in the group. The bit sequence of the acr_mask follows the events enabled order of the group. Example: Here is the snippet of the mispredict.c. Since the array has a random numbers, jumps are random and often mispredicted. The mispredicted rate depends on the compared value. For the Loop1, ~11% of all branches are mispredicted. For the Loop2, ~21% of all branches are mispredicted. main() { ... for (i = 0; i < N; i++) data[i] = rand() % 256; ... /* Loop 1 */ for (k = 0; k < 50; k++) for (i = 0; i < N; i++) if (data[i] >= 64) sum += data[i]; ... ... /* Loop 2 */ for (k = 0; k < 50; k++) for (i = 0; i < N; i++) if (data[i] >= 128) sum += data[i]; ... } Usually, a code with a high branch miss rate means a bad performance. To understand the branch miss rate of the codes, the traditional method usually samples both branches and branch-misses events. E.g., perf record -e "{cpu_atom/branch-misses/ppu, cpu_atom/branch-instructions/u}" -c 1000000 -- ./mispredict [ perf record: Woken up 4 times to write data ] [ perf record: Captured and wrote 0.925 MB perf.data (5106 samples) ] The 5106 samples are from both events and spread in both Loops. In the post-process stage, a user can know that the Loop 2 has a 21% branch miss rate. Then they can focus on the samples of branch-misses events for the Loop 2. With this patch, the user can generate the samples only when the branch miss rate > 20%. For example, perf record -e "{cpu_atom/branch-misses,period=200000,acr_mask=0x2/ppu, cpu_atom/branch-instructions,period=1000000,acr_mask=0x3/u}" -- ./mispredict (Two different periods are applied to branch-misses and branch-instructions. The ratio is set to 20%. If the branch-instructions is overflowed first, the branch-miss rate < 20%. No samples should be generated. All counters should be automatically reloaded. If the branch-misses is overflowed first, the branch-miss rate > 20%. A sample triggered by the branch-misses event should be generated. Just the counter of the branch-instructions should be automatically reloaded. The branch-misses event should only be automatically reloaded when the branch-instructions is overflowed. So the "cause" event is the branch-instructions event. The acr_mask is set to 0x2, since the event index in the group of branch-instructions is 1. The branch-instructions event is automatically reloaded no matter which events are overflowed. So the "cause" events are the branch-misses and the branch-instructions event. The acr_mask should be set to 0x3.) [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.098 MB perf.data (2498 samples) ] $perf report Percent │154: movl $0x0,-0x14(%rbp) │ ↓ jmp 1af │ for (i = j; i < N; i++) │15d: mov -0x10(%rbp),%eax │ mov %eax,-0x18(%rbp) │ ↓ jmp 1a2 │ if (data[i] >= 128) │165: mov -0x18(%rbp),%eax │ cltq │ lea 0x0(,%rax,4),%rdx │ mov -0x8(%rbp),%rax │ add %rdx,%rax │ mov (%rax),%eax │ ┌──cmp $0x7f,%eax 100.00 0.00 │ ├──jle 19e │ │sum += data[i]; The 2498 samples are all from the branch-misses events for the Loop 2. The number of samples and overhead is significantly reduced without losing any information. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-6-kan.liang@linux.intel.com --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5c547329cf02..947ad12dfdbe 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -157,6 +157,7 @@ struct hw_perf_event { union { struct { /* hardware */ u64 config; + u64 config1; u64 last_tag; u64 dyn_constraint; unsigned long config_base; -- cgit v1.2.3 From a36283e2b683f172aa1760c77325e50b16c0f792 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 7 Apr 2025 17:45:41 +0200 Subject: udp_tunnel: create a fastpath GRO lookup. Most UDP tunnels bind a socket to a local port, with ANY address, no peer and no interface index specified. Additionally it's quite common to have a single tunnel device per namespace. Track in each namespace the UDP tunnel socket respecting the above. When only a single one is present, store a reference in the netns. When such reference is not NULL, UDP tunnel GRO lookup just need to match the incoming packet destination port vs the socket local port. The tunnel socket never sets the reuse[port] flag[s]. When bound to no address and interface, no other socket can exist in the same netns matching the specified local port. Matching packets with non-local destination addresses will be aggregated, and eventually segmented as needed - no behavior changes intended. Restrict the optimization to kernel sockets only: it covers all the relevant use-cases, and user-space owned sockets could be disconnected and rebound after setup_udp_tunnel_sock(), breaking the uniqueness assumption Note that the UDP tunnel socket reference is stored into struct netns_ipv4 for both IPv4 and IPv6 tunnels. That is intentional to keep all the fastpath-related netns fields in the same struct and allow cacheline-based optimization. Currently both the IPv4 and IPv6 socket pointer share the same cacheline as the `udp_table` field. Signed-off-by: Paolo Abeni Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/41d16bc8d1257d567f9344c445b4ae0b4a91ede4.1744040675.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/udp.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 0807e21cfec9..895240177f4f 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -101,6 +101,13 @@ struct udp_sock { /* Cache friendly copy of sk->sk_peek_off >= 0 */ bool peeking_with_offset; + + /* + * Accounting for the tunnel GRO fastpath. + * Unprotected by compilers guard, as it uses space available in + * the last UDP socket cacheline. + */ + struct hlist_node tunnel_list; }; #define udp_test_bit(nr, sk) \ @@ -219,4 +226,13 @@ static inline void udp_allow_gso(struct sock *sk) #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) +static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6) +{ +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + return rcu_dereference(net->ipv4.udp_tunnel_gro[is_ipv6].sk); +#else + return NULL; +#endif +} + #endif /* _LINUX_UDP_H */ -- cgit v1.2.3 From 420aabef3ab5fa743afb4d3d391f03ef0e777ca8 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 7 Apr 2025 21:01:02 +0200 Subject: net: Drop unused @sk of __skb_try_recv_from_queue() __skb_try_recv_from_queue() deals with a queue, @sk is not used since commit e427cad6eee4 ("net: datagram: drop 'destructor' argument from several helpers"). Remove sk from function parameters, adapt callers. No functional change intended. Signed-off-by: Michal Luczaj Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250407-cleanup-drop-param-sk-v1-1-cd076979afac@rbox.co Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b974a277975a..f1381aff0f89 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4105,8 +4105,7 @@ static inline void skb_frag_list_init(struct sk_buff *skb) int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, int *err, long *timeo_p, const struct sk_buff *skb); -struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, - struct sk_buff_head *queue, +struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last); -- cgit v1.2.3 From 265daffe788aa1cc5925d0afcde4fe6e99c66638 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 7 Apr 2025 09:08:14 +0200 Subject: gpio: provide gpiod_is_equal() There are users in the kernel that directly compare raw GPIO descriptor pointers in order to determine whether they refer to the same physical GPIO pin. This accidentally works like this but is not guaranteed by any API contract. Let's provide a comparator function that hides the actual logic. Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20250407-gpiod-is-equal-v1-1-7d85f568ae6e@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 45b651c05b9c..7355abadaef4 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -180,6 +180,8 @@ struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev, enum gpiod_flags flags, const char *label); +bool gpiod_is_equal(struct gpio_desc *desc, struct gpio_desc *other); + #else /* CONFIG_GPIOLIB */ #include @@ -547,6 +549,13 @@ struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev, return ERR_PTR(-ENOSYS); } +static inline bool +gpiod_is_equal(struct gpio_desc *desc, struct gpio_desc *other) +{ + WARN_ON(desc || other); + return false; +} + #endif /* CONFIG_GPIOLIB */ #if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_HTE) -- cgit v1.2.3 From dd293df6395a2c9e0fc4faa8defeceaa907e7717 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Sun, 29 Dec 2024 16:11:25 +0100 Subject: tracing: Move trace sysctls into trace.c Move trace ctl tables into their own const array in kernel/trace/trace.c. The sysctl table register is called with subsys_initcall placing if after its original place in proc_root_init. This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c. Signed-off-by: Joel Granados Acked-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fbabc3d848b3..59774513ae45 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1298,16 +1298,9 @@ static inline void unpause_graph_tracing(void) { } #ifdef CONFIG_TRACING enum ftrace_dump_mode; -#define MAX_TRACER_SIZE 100 -extern char ftrace_dump_on_oops[]; extern int ftrace_dump_on_oops_enabled(void); -extern int tracepoint_printk; extern void disable_trace_on_warning(void); -extern int __disable_trace_on_warning; - -int tracepoint_printk_sysctl(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); #else /* CONFIG_TRACING */ static inline void disable_trace_on_warning(void) { } -- cgit v1.2.3 From 67049b53e06fa1758df1463789f286a7cba67c50 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Wed, 8 Jan 2025 12:55:58 +0100 Subject: stack_tracer: move sysctl registration to kernel/trace/trace_stack.c Move stack_tracer_enabled into trace_stack_sysctl_table. This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c. Acked-by: Steven Rostedt (Google) Signed-off-by: Joel Granados --- include/linux/ftrace.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 59774513ae45..95851a6fb942 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -569,8 +569,6 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, #ifdef CONFIG_STACK_TRACER -extern int stack_tracer_enabled; - int stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -- cgit v1.2.3 From e1bdbbc98279164d910d2de82a745f090a8b249f Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 7 Apr 2025 13:36:55 -0500 Subject: ACPI: Add missing prototype for non CONFIG_SUSPEND/CONFIG_X86 case acpi_register_lps0_dev() and acpi_unregister_lps0_dev() may be used in drivers that don't require CONFIG_SUSPEND or compile on !X86. Add prototypes for those cases. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502191627.fRgoBwcZ-lkp@intel.com/ Signed-off-by: Mario Limonciello Link: https://patch.msgid.link/20250407183656.1503446-1-superm1@kernel.org Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 3f2e93ed9730..fc372bbaa547 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1125,13 +1125,13 @@ void acpi_os_set_prepare_extended_sleep(int (*func)(u8 sleep_state, acpi_status acpi_os_prepare_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b); -#if defined(CONFIG_SUSPEND) && defined(CONFIG_X86) struct acpi_s2idle_dev_ops { struct list_head list_node; void (*prepare)(void); void (*check)(void); void (*restore)(void); }; +#if defined(CONFIG_SUSPEND) && defined(CONFIG_X86) int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg); void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg); int acpi_get_lps0_constraint(struct acpi_device *adev); @@ -1140,6 +1140,13 @@ static inline int acpi_get_lps0_constraint(struct device *dev) { return ACPI_STATE_UNKNOWN; } +static inline int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg) +{ + return -ENODEV; +} +static inline void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg) +{ +} #endif /* CONFIG_SUSPEND && CONFIG_X86 */ void arch_reserve_mem_area(acpi_physical_address addr, size_t size); #else -- cgit v1.2.3 From 092d00ead733563f6d278295e0b5c5f97558b726 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Mar 2025 11:56:38 +0100 Subject: cleanup: Provide retain_and_null_ptr() In cases where an allocation is consumed by another function, the allocation needs to be retained on success or freed on failure. The code pattern is usually: struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL); struct bar *b; ,,, // Initialize f ... if (ret) goto free; ... bar = bar_create(f); if (!bar) { ret = -ENOMEM; goto free; } ... return 0; free: kfree(f); return ret; This prevents using __free(kfree) on @f because there is no canonical way to tell the cleanup code that the allocation should not be freed. Abusing no_free_ptr() by force ignoring the return value is not really a sensible option either. Provide an explicit macro retain_and_null_ptr(), which NULLs the cleanup pointer. That makes it easy to analyze and reason about. Signed-off-by: Thomas Gleixner Reviewed-by: Jonathan Cameron Reviewed-by: James Bottomley Link: https://lore.kernel.org/all/20250319105506.083538907@linutronix.de --- include/linux/cleanup.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 7e57047e1564..7093e1d08af0 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -216,6 +216,25 @@ const volatile void * __must_check_fn(const volatile void *val) #define return_ptr(p) return no_free_ptr(p) +/* + * Only for situations where an allocation is handed in to another function + * and consumed by that function on success. + * + * struct foo *f __free(kfree) = kzalloc(sizeof(*f), GFP_KERNEL); + * + * setup(f); + * if (some_condition) + * return -EINVAL; + * .... + * ret = bar(f); + * if (!ret) + * retain_and_null_ptr(f); + * return ret; + * + * After retain_and_null_ptr(f) the variable f is NULL and cannot be + * dereferenced anymore. + */ +#define retain_and_null_ptr(p) ((void)__get_and_null(p, NULL)) /* * DEFINE_CLASS(name, type, exit, init, init_args...): -- cgit v1.2.3 From 0dac2b09303c89ffb21d25d40bf6aefd39f214b0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Mar 2025 11:56:39 +0100 Subject: genirq/msi: Use lock guards for MSI descriptor locking Provide a lock guard for MSI descriptor locking and update the core code accordingly. No functional change intended. Signed-off-by: Thomas Gleixner Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/all/20250319105506.144672678@linutronix.de --- include/linux/irqdomain.h | 2 ++ include/linux/msi.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index bb7111105296..2f955716cf82 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -281,6 +281,8 @@ static inline struct fwnode_handle *irq_domain_alloc_fwnode(phys_addr_t *pa) void irq_domain_free_fwnode(struct fwnode_handle *fwnode); +DEFINE_FREE(irq_domain_free_fwnode, struct fwnode_handle *, if (_T) irq_domain_free_fwnode(_T)) + struct irq_domain_chip_generic_info; /** diff --git a/include/linux/msi.h b/include/linux/msi.h index 86e42742fd0f..4e439916dbcf 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -232,6 +232,9 @@ int msi_setup_device_data(struct device *dev); void msi_lock_descs(struct device *dev); void msi_unlock_descs(struct device *dev); +DEFINE_LOCK_GUARD_1(msi_descs_lock, struct device, msi_lock_descs(_T->lock), + msi_unlock_descs(_T->lock)); + struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid, enum msi_desc_filter filter); -- cgit v1.2.3 From 9357e329cdeb6f2d27b431a22d4965700bec478a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Mar 2025 11:57:01 +0100 Subject: genirq/msi: Rename msi_[un]lock_descs() Now that all abuse is gone and the legit users are converted to guard(msi_descs_lock), rename the lock functions and document them as internal. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/all/20250319105506.864699741@linutronix.de --- include/linux/msi.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 4e439916dbcf..8c0ec9fc05a3 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -229,11 +229,11 @@ struct msi_dev_domain { int msi_setup_device_data(struct device *dev); -void msi_lock_descs(struct device *dev); -void msi_unlock_descs(struct device *dev); +void __msi_lock_descs(struct device *dev); +void __msi_unlock_descs(struct device *dev); -DEFINE_LOCK_GUARD_1(msi_descs_lock, struct device, msi_lock_descs(_T->lock), - msi_unlock_descs(_T->lock)); +DEFINE_LOCK_GUARD_1(msi_descs_lock, struct device, __msi_lock_descs(_T->lock), + __msi_unlock_descs(_T->lock)); struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid, enum msi_desc_filter filter); -- cgit v1.2.3 From 6fec833b9d70c54ceacbf7d07665215fbd0cddef Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 28 Mar 2025 21:42:48 +0100 Subject: cpufreq: Add and use cpufreq policy locking guards Introduce "read" and "write" locking guards for cpufreq policies and use them where applicable in the cpufreq core. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Reviewed-by: Mario Limonciello Acked-by: Sudeep Holla Tested-by: Sudeep Holla Link: https://patch.msgid.link/8518682.T7Z3S40VBb@rjwysocki.net --- include/linux/cpufreq.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 400fee6427a5..cb972d2aa8df 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -170,6 +170,12 @@ struct cpufreq_policy { struct notifier_block nb_max; }; +DEFINE_GUARD(cpufreq_policy_write, struct cpufreq_policy *, + down_write(&_T->rwsem), up_write(&_T->rwsem)) + +DEFINE_GUARD(cpufreq_policy_read, struct cpufreq_policy *, + down_read(&_T->rwsem), up_read(&_T->rwsem)) + /* * Used for passing new cpufreq policy data to the cpufreq driver's ->verify() * callback for sanitization. That callback is only expected to modify the min -- cgit v1.2.3 From c7282dce257480f0f22ed3db69cfb400a18709f4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 28 Mar 2025 21:45:38 +0100 Subject: cpufreq: Drop cpufreq_cpu_acquire() and cpufreq_cpu_release() Since cpufreq_cpu_acquire() and cpufreq_cpu_release() have no more users in the tree, remove them. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Reviewed-by: Mario Limonciello Acked-by: Sudeep Holla Tested-by: Sudeep Holla Link: https://patch.msgid.link/3880470.kQq0lBPeGt@rjwysocki.net --- include/linux/cpufreq.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cb972d2aa8df..a33a094ef755 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -241,8 +241,6 @@ void disable_cpufreq(void); u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy); -struct cpufreq_policy *cpufreq_cpu_acquire(unsigned int cpu); -void cpufreq_cpu_release(struct cpufreq_policy *policy); int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); void refresh_frequency_limits(struct cpufreq_policy *policy); void cpufreq_update_policy(unsigned int cpu); -- cgit v1.2.3 From eaff6b62d3439ca6ee00dba4f77673a8c37dac20 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 28 Mar 2025 21:48:54 +0100 Subject: cpufreq: Pass policy pointer to ->update_limits() Since cpufreq_update_limits() obtains a cpufreq policy pointer for the given CPU and reference counts the corresponding policy object, it may as well pass the policy pointer to the cpufreq driver's ->update_limits() callback which allows that callback to avoid invoking cpufreq_cpu_get() for the same CPU. Accordingly, redefine ->update_limits() to take a policy pointer instead of a CPU number and update both drivers implementing it, intel_pstate and amd-pstate, as needed. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Reviewed-by: Mario Limonciello Acked-by: Srinivas Pandruvada Acked-by: Sudeep Holla Tested-by: Sudeep Holla Link: https://patch.msgid.link/8560367.NyiUUSuA9g@rjwysocki.net --- include/linux/cpufreq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index a33a094ef755..f3cf2adea18f 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -399,7 +399,7 @@ struct cpufreq_driver { unsigned int (*get)(unsigned int cpu); /* Called to update policy limits on firmware notifications. */ - void (*update_limits)(unsigned int cpu); + void (*update_limits)(struct cpufreq_policy *policy); /* optional */ int (*bios_limit)(int cpu, unsigned int *limit); -- cgit v1.2.3 From 855c634930f04c21434fae9c41a7a7372b6ac879 Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Thu, 27 Mar 2025 12:07:08 +0100 Subject: PCI: Remove pcim_iounmap_regions() All users of the deprecated function pcim_iounmap_regions() have been ported by now. Remove it. Signed-off-by: Philipp Stanner Signed-off-by: Bjorn Helgaas Reviewed-by: Zijun Hu Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20250327110707.20025-4-phasta@kernel.org --- include/linux/pci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 0e8e3fd77e96..a60fdb344d9e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2322,7 +2322,6 @@ void pcim_iounmap(struct pci_dev *pdev, void __iomem *addr); void __iomem * const *pcim_iomap_table(struct pci_dev *pdev); int pcim_request_region(struct pci_dev *pdev, int bar, const char *name); int pcim_iomap_regions(struct pci_dev *pdev, int mask, const char *name); -void pcim_iounmap_regions(struct pci_dev *pdev, int mask); void __iomem *pcim_iomap_range(struct pci_dev *pdev, int bar, unsigned long offset, unsigned long len); -- cgit v1.2.3 From a82dc19db13649aa4232ce37cb6f4ceff851e2fe Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:48 -0700 Subject: net: avoid potential race between netdev_get_by_index_lock() and netns switch netdev_get_by_index_lock() performs following steps: rcu_lock(); dev = lookup(netns, ifindex); dev_get(dev); rcu_unlock(); [... lock & validate the dev ...] return dev Validation right now only checks if the device is registered but since the lookup is netns-aware we must also protect against the device switching netns right after we dropped the RCU lock. Otherwise the caller in netns1 may get a pointer to a device which has just switched to netns2. We can't hold the lock for the entire netns change process (because of the NETDEV_UNREGISTER notifier), and there's no existing marking to indicate that the netns is unlisted because of netns move, so add one. AFAIU none of the existing netdev_get_by_index_lock() callers can suffer from this problem (NAPI code double checks the netns membership and other callers are either under rtnl_lock or not ns-sensitive), so this patch does not have to be treated as a fix. Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cf3b6445817b..8e9be80bc167 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1952,6 +1952,7 @@ enum netdev_reg_state { * @priv_destructor: Called from unregister * @npinfo: XXX: need comments on this one * @nd_net: Network namespace this network device is inside + * protected by @lock * * @ml_priv: Mid-layer private * @ml_priv_type: Mid-layer private type @@ -2359,6 +2360,9 @@ struct net_device { bool dismantle; + /** @moving_ns: device is changing netns, protected by @lock */ + bool moving_ns; + enum { RTNL_LINK_INITIALIZED, RTNL_LINK_INITIALIZING, @@ -2521,7 +2525,7 @@ struct net_device { * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: - * @up + * @up, @moving_ns, @nd_net * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues -- cgit v1.2.3 From 606048cbd8346e616cfaee01b0143d072534136d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:49 -0700 Subject: net: designate XSK pool pointers in queues as "ops protected" Read accesses go via xsk_get_pool_from_qid(), the call coming from the core and gve look safe (other "ops locked" drivers don't support XSK). Write accesses go via xsk_reg_pool_at_qid() and xsk_clear_pool_at_qid(). Former is already under the ops lock, latter is not (both coming from the workqueue via xp_clear_dev() and NETDEV_UNREGISTER via xsk_notifier()). Acked-by: Stanislav Fomichev Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8e9be80bc167..7242fb8a22fc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -688,6 +688,7 @@ struct netdev_queue { /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; #ifdef CONFIG_XDP_SOCKETS + /* "ops protected", see comment about net_device::lock */ struct xsk_buff_pool *pool; #endif -- cgit v1.2.3 From 03df156dd3a6d5992f17682cd5c3b11e5ffdae02 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:52 -0700 Subject: xdp: double protect netdev->xdp_flags with netdev->lock Protect xdp_features with netdev->lock. This way pure readers no longer have to take rtnl_lock to access the field. This includes calling NETDEV_XDP_FEAT_CHANGE under the lock. Looks like that's fine for bonding, the only "real" listener, it's the same as ethtool feature change. In terms of normal drivers - only GVE need special consideration (other drivers don't use instance lock or don't support XDP). It calls xdp_set_features_flag() helper from gve_init_priv() which in turn is called from gve_reset_recovery() (locked), or prior to netdev registration. So switch to _locked. Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Acked-by: Harshitha Ramamurthy Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20250408195956.412733-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7242fb8a22fc..dece2ae396a1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2526,7 +2526,7 @@ struct net_device { * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: - * @up, @moving_ns, @nd_net + * @up, @moving_ns, @nd_net, @xdp_flags * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues -- cgit v1.2.3 From 229671ac60e298b85c2644f52d7e487e9f487d06 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Apr 2025 20:27:42 +0000 Subject: net: remove cpu stall in txq_trans_update() txq_trans_update() currently uses txq->xmit_lock_owner to conditionally update txq->trans_start. For regular devices, txq->xmit_lock_owner is updated from HARD_TX_LOCK() and HARD_TX_UNLOCK(), and this apparently causes cpu stalls. Using dev->lltx, which sits in a read-mostly cache-line, and already used in HARD_TX_LOCK() and HARD_TX_UNLOCK() helps cpu prediction. On an AMD EPYC 7B12 dual socket server, tcp_rr with 128 threads and 30,000 flows gets a 5 % increase in throughput. As explained in commit 95ecba62e2fd ("net: fix races in netdev_tx_sent_queue()/dev_watchdog()") I am planning to no longer update txq->trans_start in the fast path in a followup patch. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250408202742.2145516-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dece2ae396a1..a28a08046615 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4693,9 +4693,10 @@ static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) /* * txq->trans_start can be read locklessly from dev_watchdog() */ -static inline void txq_trans_update(struct netdev_queue *txq) +static inline void txq_trans_update(const struct net_device *dev, + struct netdev_queue *txq) { - if (txq->xmit_lock_owner != -1) + if (!dev->lltx) WRITE_ONCE(txq->trans_start, jiffies); } @@ -5214,7 +5215,7 @@ static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_devi rc = __netdev_start_xmit(ops, skb, dev, more); if (rc == NETDEV_TX_OK) - txq_trans_update(txq); + txq_trans_update(dev, txq); return rc; } -- cgit v1.2.3 From 8702048bb8313a21ec9179e9ce7ad3c5cb2ef072 Mon Sep 17 00:00:00 2001 From: Jocelyn Falempe Date: Mon, 7 Apr 2025 15:42:25 +0200 Subject: mm/kmap: Add kmap_local_page_try_from_panic() kmap_local_page() can be unsafe to call from a panic handler, if CONFIG_HIGHMEM is set, and the page is in the highmem zone. So add kmap_local_page_try_from_panic() to handle this case. Suggested-by: Simona Vetter Reviewed-by: Thomas Gleixner Signed-off-by: Jocelyn Falempe Link: https://lore.kernel.org/r/20250407140138.162383-2-jfalempe@redhat.com --- include/linux/highmem-internal.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index dd100e849f5e..9a7683d79a4b 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -73,6 +73,14 @@ static inline void *kmap_local_page(struct page *page) return __kmap_local_page_prot(page, kmap_prot); } +static inline void *kmap_local_page_try_from_panic(struct page *page) +{ + if (!PageHighMem(page)) + return page_address(page); + /* If the page is in HighMem, it's not safe to kmap it.*/ + return NULL; +} + static inline void *kmap_local_folio(struct folio *folio, size_t offset) { struct page *page = folio_page(folio, offset / PAGE_SIZE); @@ -180,6 +188,11 @@ static inline void *kmap_local_page(struct page *page) return page_address(page); } +static inline void *kmap_local_page_try_from_panic(struct page *page) +{ + return page_address(page); +} + static inline void *kmap_local_folio(struct folio *folio, size_t offset) { return page_address(&folio->page) + offset; -- cgit v1.2.3 From 1490cbb9dbfd0eabfe45f9b674097aea7e6760fc Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 10 Mar 2025 16:54:51 +0200 Subject: device property: Split fwnode_get_child_node_count() The new helper is introduced to allow counting the child firmware nodes of their parent without requiring a device to be passed. This also makes the fwnode and device property API more symmetrical with the rest. Signed-off-by: Andy Shevchenko Acked-by: "Rafael J. Wysocki" Reviewed-by: Sakari Ailus Reviewed-by: Heikki Krogerus Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20250310150835.3139322-2-andriy.shevchenko@linux.intel.com Signed-off-by: Lee Jones --- include/linux/property.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index e214ecd241eb..bc5bfc98176b 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -208,7 +208,12 @@ DEFINE_FREE(fwnode_handle, struct fwnode_handle *, fwnode_handle_put(_T)) int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index); int fwnode_irq_get_byname(const struct fwnode_handle *fwnode, const char *name); -unsigned int device_get_child_node_count(const struct device *dev); +unsigned int fwnode_get_child_node_count(const struct fwnode_handle *fwnode); + +static inline unsigned int device_get_child_node_count(const struct device *dev) +{ + return fwnode_get_child_node_count(dev_fwnode(dev)); +} static inline int device_property_read_u8(const struct device *dev, const char *propname, u8 *val) -- cgit v1.2.3 From 7e3711eb87c584ed224a7ad7000eba36e6fa3a51 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 21 Mar 2025 10:53:55 +0100 Subject: fbdev: Track display blanking state Store the display's blank status in struct fb_info.blank and track it in fb_blank(). As an extra, the status is now available from the sysfs blank attribute. Support for blanking is optional. Therefore framebuffer_alloc() initializes the state to FB_BLANK_UNBLANK (i.e., the display is on). If the fb_blank callback has been set, register_framebuffer() sets the state to FB_BLANK_POWERDOWN. On the first modeset, the call to fb_blank() will update it to _UNBLANK. This is important, as listeners to FB_EVENT_BLANK will now see the display being switched on. Signed-off-by: Thomas Zimmermann Acked-by: Simona Vetter Link: https://lore.kernel.org/r/20250321095517.313713-3-tzimmermann@suse.de Signed-off-by: Lee Jones --- include/linux/fb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index cd653862ab99..348aa2e146e2 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -472,6 +472,8 @@ struct fb_info { struct list_head modelist; /* mode list */ struct fb_videomode *mode; /* current mode */ + int blank; /* current blanking; see FB_BLANK_ constants */ + #if IS_ENABLED(CONFIG_FB_BACKLIGHT) /* assigned backlight device */ /* set before framebuffer registration, -- cgit v1.2.3 From 726491f2038ec71122d45700f3abf36fdb277aaa Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 21 Mar 2025 10:53:57 +0100 Subject: backlight: Implement fbdev tracking with blank state from event Look at the blank state provided by FB_EVENT_BLANK to determine whether to enable or disable a backlight. Remove the tracking fields from struct backlight_device. Tracking requires three variables, fb_on, prev_fb_on and the backlight's use_count. If fb_on is true, the display has been unblanked. The backlight needs to be enabled if the display was blanked before (i.e., prev_fb_on is false) or if use_count is still at 0. If fb_on is false, the display has been blanked. In this case, the backlight has to be disabled was unblanked before and the backlight's use_count is greater than 0. This change removes fbdev state tracking from blacklight. All the backlight requires it its own use counter and information about changes to the display. Removing fbdev internals makes backlight drivers easier to integrate into other display drivers, such as DRM. Signed-off-by: Thomas Zimmermann Reviewed-by: "Daniel Thompson (RISCstar)" Acked-by: Simona Vetter Link: https://lore.kernel.org/r/20250321095517.313713-5-tzimmermann@suse.de Signed-off-by: Lee Jones --- include/linux/backlight.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backlight.h b/include/linux/backlight.h index f5652e5a9060..03723a5478f8 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -294,15 +294,7 @@ struct backlight_device { struct device dev; /** - * @fb_bl_on: The state of individual fbdev's. - * - * Multiple fbdev's may share one backlight device. The fb_bl_on - * records the state of the individual fbdev. - */ - bool fb_bl_on[FB_MAX]; - - /** - * @use_count: The number of uses of fb_bl_on. + * @use_count: The number of unblanked displays. */ int use_count; }; -- cgit v1.2.3 From b01beb2f1f6bcda17634a5b529865ffc5a9b795f Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 21 Mar 2025 10:53:59 +0100 Subject: backlight: Replace fb events with a dedicated function call Remove support for fb events from backlight subsystem. Provide the helper backlight_notify_blank_all() instead. Also export the existing helper backlight_notify_blank() to update a single backlight device. In fbdev, call either helper to inform the backlight subsystem of changes to a display's blank state. If the framebuffer device has a specific backlight, only update this one; otherwise update all. v4: - protect blacklight declarations with IS_REACHABLE() (kernel test robot) v3: - declare empty fb_bl_notify_blank() as static inline (kernel test robot) Signed-off-by: Thomas Zimmermann Acked-by: Simona Vetter Reviewed-by: "Daniel Thompson (RISCstar)" Link: https://lore.kernel.org/r/20250321095517.313713-7-tzimmermann@suse.de Signed-off-by: Lee Jones --- include/linux/backlight.h | 22 ++++++++++++++++------ include/linux/fb.h | 4 ++++ 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backlight.h b/include/linux/backlight.h index 03723a5478f8..10e626db7eee 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -12,7 +12,6 @@ #include #include #include -#include #include /** @@ -278,11 +277,6 @@ struct backlight_device { */ const struct backlight_ops *ops; - /** - * @fb_notif: The framebuffer notifier block - */ - struct notifier_block fb_notif; - /** * @entry: List entry of all registered backlight devices */ @@ -400,6 +394,22 @@ struct backlight_device *backlight_device_get_by_type(enum backlight_type type); int backlight_device_set_brightness(struct backlight_device *bd, unsigned long brightness); +#if IS_REACHABLE(CONFIG_BACKLIGHT_CLASS_DEVICE) +void backlight_notify_blank(struct backlight_device *bd, + struct device *display_dev, + bool fb_on, bool prev_fb_on); +void backlight_notify_blank_all(struct device *display_dev, + bool fb_on, bool prev_fb_on); +#else +static inline void backlight_notify_blank(struct backlight_device *bd, + struct device *display_dev, + bool fb_on, bool prev_fb_on) +{ } +static inline void backlight_notify_blank_all(struct device *display_dev, + bool fb_on, bool prev_fb_on) +{ } +#endif + #define to_backlight_device(obj) container_of(obj, struct backlight_device, dev) /** diff --git a/include/linux/fb.h b/include/linux/fb.h index 348aa2e146e2..835e13d6eba8 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -758,11 +758,15 @@ extern void fb_bl_default_curve(struct fb_info *fb_info, u8 off, u8 min, u8 max) #if IS_ENABLED(CONFIG_FB_BACKLIGHT) struct backlight_device *fb_bl_device(struct fb_info *info); +void fb_bl_notify_blank(struct fb_info *info, int old_blank); #else static inline struct backlight_device *fb_bl_device(struct fb_info *info) { return NULL; } + +static inline void fb_bl_notify_blank(struct fb_info *info, int old_blank) +{ } #endif static inline struct lcd_device *fb_lcd_device(struct fb_info *info) -- cgit v1.2.3 From bc70cc84f5a2ebfd7e7112e9b8837e0c7954fc65 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 21 Mar 2025 10:54:01 +0100 Subject: backlight: lcd: Replace fb events with a dedicated function call Remove support for fb events from the lcd subsystem. Provide the helper lcd_notify_blank_all() instead. In fbdev, call lcd_notify_blank_all() to inform the lcd subsystem of changes to a display's blank state. Fbdev maintains a list of all installed notifiers. Instead of fbdev notifiers, maintain an internal list of lcd devices. v3: - export lcd_notify_mode_change_all() (kernel test robot) v2: - maintain global list of lcd devices - avoid IS_REACHABLE() in source file - use lock guards - initialize lcd list and list mutex Signed-off-by: Thomas Zimmermann Acked-by: Simona Vetter Reviewed-by: "Daniel Thompson (RISCstar)" Link: https://lore.kernel.org/r/20250321095517.313713-9-tzimmermann@suse.de Signed-off-by: Lee Jones --- include/linux/lcd.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lcd.h b/include/linux/lcd.h index c3ccdff4519a..d4fa03722b72 100644 --- a/include/linux/lcd.h +++ b/include/linux/lcd.h @@ -11,7 +11,6 @@ #include #include -#include #define LCD_POWER_ON (0) #define LCD_POWER_REDUCED (1) // deprecated; don't use in new code @@ -79,8 +78,11 @@ struct lcd_device { const struct lcd_ops *ops; /* Serialise access to set_power method */ struct mutex update_lock; - /* The framebuffer notifier block */ - struct notifier_block fb_notif; + + /** + * @entry: List entry of all registered lcd devices + */ + struct list_head entry; struct device dev; }; @@ -125,6 +127,19 @@ extern void lcd_device_unregister(struct lcd_device *ld); extern void devm_lcd_device_unregister(struct device *dev, struct lcd_device *ld); +#if IS_REACHABLE(CONFIG_LCD_CLASS_DEVICE) +void lcd_notify_blank_all(struct device *display_dev, int power); +void lcd_notify_mode_change_all(struct device *display_dev, + unsigned int width, unsigned int height); +#else +static inline void lcd_notify_blank_all(struct device *display_dev, int power) +{} + +static inline void lcd_notify_mode_change_all(struct device *display_dev, + unsigned int width, unsigned int height) +{} +#endif + #define to_lcd_device(obj) container_of(obj, struct lcd_device, dev) static inline void * lcd_get_data(struct lcd_device *ld_dev) -- cgit v1.2.3 From dc2139c0aa3283e5749109641af1878ed7bf7371 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 21 Mar 2025 10:54:03 +0100 Subject: leds: backlight trigger: Replace fb events with a dedicated function call Remove support for fb events from the led backlight trigger. Provide the helper ledtrig_backlight_blank() instead. Call it from fbdev to inform the trigger of changes to a display's blank state. Fbdev maintains a list of all installed notifiers. Instead of the fbdev notifiers, maintain an internal list of led backlight triggers. v3: - export ledtrig_backlight_blank() v2: - maintain global list of led backlight triggers (Lee) - avoid IS_REACHABLE() in source file (Lee) - notify on changes to blank state instead of display state - use lock guards - initialize led list and list mutex Signed-off-by: Thomas Zimmermann Acked-by: Simona Vetter Link: https://lore.kernel.org/r/20250321095517.313713-11-tzimmermann@suse.de Signed-off-by: Lee Jones --- include/linux/leds.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index 98f9719c924c..b3f0aa081064 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -640,6 +640,12 @@ static inline void ledtrig_flash_ctrl(bool on) {} static inline void ledtrig_torch_ctrl(bool on) {} #endif +#if IS_REACHABLE(CONFIG_LEDS_TRIGGER_BACKLIGHT) +void ledtrig_backlight_blank(bool blank); +#else +static inline void ledtrig_backlight_blank(bool blank) {} +#endif + /* * Generic LED platform data for describing LED names and default triggers. */ -- cgit v1.2.3 From d32a0b567a8a8b6e677d35c4f8eb8bd32b7029a0 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 21 Mar 2025 10:54:04 +0100 Subject: fbdev: Remove constants of unused events The constants FB_EVENT_MODE_CHANGE and FB_EVENT_BLANK are unused. Remove them from the header file. Signed-off-by: Thomas Zimmermann Acked-by: Simona Vetter Link: https://lore.kernel.org/r/20250321095517.313713-12-tzimmermann@suse.de Signed-off-by: Lee Jones --- include/linux/fb.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 835e13d6eba8..05cc251035da 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -129,18 +129,12 @@ struct fb_cursor_user { * Register/unregister for framebuffer events */ -/* The resolution of the passed in fb_info about to change */ -#define FB_EVENT_MODE_CHANGE 0x01 - #ifdef CONFIG_GUMSTIX_AM200EPD /* only used by mach-pxa/am200epd.c */ #define FB_EVENT_FB_REGISTERED 0x05 #define FB_EVENT_FB_UNREGISTERED 0x06 #endif -/* A display blank is requested */ -#define FB_EVENT_BLANK 0x09 - struct fb_event { struct fb_info *info; void *data; -- cgit v1.2.3 From 589a7c406a721f5d3a818ad0003799180f027dfa Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Apr 2025 12:20:43 +0200 Subject: cpufreq: Drop unused cpufreq_get_policy() A recent change has introduced a bug into cpufreq_get_policy(), but this function is not used, so it's better to drop it altogether. Reported-by: Dan Carpenter Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Acked-by: Sudeep Holla Link: https://patch.msgid.link/2802770.mvXUDI8C0e@rjwysocki.net --- include/linux/cpufreq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index f3cf2adea18f..850fe7371cb1 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -241,7 +241,6 @@ void disable_cpufreq(void); u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy); -int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); void refresh_frequency_limits(struct cpufreq_policy *policy); void cpufreq_update_policy(unsigned int cpu); void cpufreq_update_limits(unsigned int cpu); -- cgit v1.2.3 From 588d5d20ca8defa5ba5d1b536ff3695f6ab7aa87 Mon Sep 17 00:00:00 2001 From: Kaustabh Chakraborty Date: Thu, 10 Apr 2025 14:01:14 +0530 Subject: phy: exynos5-usbdrd: add exynos7870 USBDRD support Implement support for Exynos7870 USB DRD on top of the existing exynos5-usbdrd driver. Exynos7870 has a single USB 2.0 DRD PHY controller and no 3.0 PHYs. Thus, it only supports the UTMI interface. Moreover, the PMU register offset for enabling the PHY controller is different for SoCs such as Exynos7870, where BIT(0) is for the 3.0 PHY and BIT(1) is for the 2.0 PHY. The phy_isol function for Exynos7870 uses the appropriate register offsets. Signed-off-by: Kaustabh Chakraborty Link: https://lore.kernel.org/r/20250410-exynos7870-usbphy-v2-3-2eb005987455@disroot.org Signed-off-by: Vinod Koul --- include/linux/soc/samsung/exynos-regs-pmu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h index ce1a3790d6fb..cde299a85384 100644 --- a/include/linux/soc/samsung/exynos-regs-pmu.h +++ b/include/linux/soc/samsung/exynos-regs-pmu.h @@ -55,6 +55,8 @@ #define EXYNOS4_MIPI_PHY_SRESETN (1 << 1) #define EXYNOS4_MIPI_PHY_MRESETN (1 << 2) #define EXYNOS4_MIPI_PHY_RESET_MASK (3 << 1) +/* USB PHY enable bit, valid for Exynos7870 */ +#define EXYNOS7870_USB2PHY_ENABLE (1 << 1) #define S5P_INFORM0 0x0800 #define S5P_INFORM1 0x0804 -- cgit v1.2.3 From b2849b0723668ae3e0739b2c9c066eafe8bc0961 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 3 Apr 2025 12:09:40 +0200 Subject: svsm: Add header with SVSM_VTPM_CMD helpers Add helpers for the SVSM_VTPM_CMD calls used by the vTPM protocol defined by the AMD SVSM spec [1]. The vTPM protocol follows the Official TPM 2.0 Reference Implementation (originally by Microsoft, now part of the TCG) simulator protocol. [1] "Secure VM Service Module for SEV-SNP Guests" Publication # 58019 Revision: 1.00 Co-developed-by: James Bottomley Signed-off-by: James Bottomley Co-developed-by: Claudio Carvalho Signed-off-by: Claudio Carvalho Signed-off-by: Stefano Garzarella Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20250403100943.120738-3-sgarzare@redhat.com --- include/linux/tpm_svsm.h | 149 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 include/linux/tpm_svsm.h (limited to 'include/linux') diff --git a/include/linux/tpm_svsm.h b/include/linux/tpm_svsm.h new file mode 100644 index 000000000000..38e341f9761a --- /dev/null +++ b/include/linux/tpm_svsm.h @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 James.Bottomley@HansenPartnership.com + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * + * Helpers for the SVSM_VTPM_CMD calls used by the vTPM protocol defined by the + * AMD SVSM spec [1]. + * + * The vTPM protocol follows the Official TPM 2.0 Reference Implementation + * (originally by Microsoft, now part of the TCG) simulator protocol. + * + * [1] "Secure VM Service Module for SEV-SNP Guests" + * Publication # 58019 Revision: 1.00 + */ +#ifndef _TPM_SVSM_H_ +#define _TPM_SVSM_H_ + +#include +#include +#include + +#define SVSM_VTPM_MAX_BUFFER 4096 /* max req/resp buffer size */ + +/** + * struct svsm_vtpm_request - Generic request for single word command + * @cmd: The command to send + * + * Defined by AMD SVSM spec [1] in section "8.2 SVSM_VTPM_CMD Call" - + * Table 15: vTPM Common Request/Response Structure + * Byte Size     In/Out    Description + * Offset    (Bytes) + * 0x000     4          In        Platform command + *                         Out       Platform command response size + */ +struct svsm_vtpm_request { + u32 cmd; +}; + +/** + * struct svsm_vtpm_response - Generic response + * @size: The response size (zero if nothing follows) + * + * Defined by AMD SVSM spec [1] in section "8.2 SVSM_VTPM_CMD Call" - + * Table 15: vTPM Common Request/Response Structure + * Byte Size     In/Out    Description + * Offset    (Bytes) + * 0x000     4          In        Platform command + *                         Out       Platform command response size + * + * Note: most TCG Simulator commands simply return zero here with no indication + * of success or failure. + */ +struct svsm_vtpm_response { + u32 size; +}; + +/** + * struct svsm_vtpm_cmd_request - Structure for a TPM_SEND_COMMAND request + * @cmd: The command to send (must be TPM_SEND_COMMAND) + * @locality: The locality + * @buf_size: The size of the input buffer following + * @buf: A buffer of size buf_size + * + * Defined by AMD SVSM spec [1] in section "8.2 SVSM_VTPM_CMD Call" - + * Table 16: TPM_SEND_COMMAND Request Structure + * Byte Size Meaning + * Offset    (Bytes) + * 0x000     4          Platform command (8) + * 0x004     1          Locality (must-be-0) + * 0x005     4          TPM Command size (in bytes) + * 0x009     Variable   TPM Command + * + * Note: the TCG Simulator expects @buf_size to be equal to the size of the + * specific TPM command, otherwise an TPM_RC_COMMAND_SIZE error is returned. + */ +struct svsm_vtpm_cmd_request { + u32 cmd; + u8 locality; + u32 buf_size; + u8 buf[]; +} __packed; + +/** + * struct svsm_vtpm_cmd_response - Structure for a TPM_SEND_COMMAND response + * @buf_size: The size of the output buffer following + * @buf: A buffer of size buf_size + * + * Defined by AMD SVSM spec [1] in section "8.2 SVSM_VTPM_CMD Call" - + * Table 17: TPM_SEND_COMMAND Response Structure + * Byte Size Meaning + * Offset    (Bytes) + * 0x000     4          Response size (in bytes) + * 0x004     Variable   Response + */ +struct svsm_vtpm_cmd_response { + u32 buf_size; + u8 buf[]; +}; + +/** + * svsm_vtpm_cmd_request_fill() - Fill a TPM_SEND_COMMAND request to be sent to SVSM + * @req: The struct svsm_vtpm_cmd_request to fill + * @locality: The locality + * @buf: The buffer from where to copy the payload of the command + * @len: The size of the buffer + * + * Return: 0 on success, negative error code on failure. + */ +static inline int +svsm_vtpm_cmd_request_fill(struct svsm_vtpm_cmd_request *req, u8 locality, + const u8 *buf, size_t len) +{ + if (len > SVSM_VTPM_MAX_BUFFER - sizeof(*req)) + return -EINVAL; + + req->cmd = 8; /* TPM_SEND_COMMAND */ + req->locality = locality; + req->buf_size = len; + + memcpy(req->buf, buf, len); + + return 0; +} + +/** + * svsm_vtpm_cmd_response_parse() - Parse a TPM_SEND_COMMAND response received from SVSM + * @resp: The struct svsm_vtpm_cmd_response to parse + * @buf: The buffer where to copy the response + * @len: The size of the buffer + * + * Return: buffer size filled with the response on success, negative error + * code on failure. + */ +static inline int +svsm_vtpm_cmd_response_parse(const struct svsm_vtpm_cmd_response *resp, u8 *buf, + size_t len) +{ + if (len < resp->buf_size) + return -E2BIG; + + if (resp->buf_size > SVSM_VTPM_MAX_BUFFER - sizeof(*resp)) + return -EINVAL; // Invalid response from the platform TPM + + memcpy(buf, resp->buf, resp->buf_size); + + return resp->buf_size; +} + +#endif /* _TPM_SVSM_H_ */ -- cgit v1.2.3 From 74a70e80daa993445a8f0be817f8152a3b9d3b41 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 9 Apr 2025 22:43:10 +0200 Subject: PCI: Remove pci_fixup_cardbus() Since 1c7f4fe86f17 ("powerpc/pci: Remove pcibios_setup_bus_devices()") there's no architecture left setting pci_fixup_cardbus. Therefore remove support from PCI core. Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/8de7da4c-2b16-4ee1-8c42-0d04f3c821c6@gmail.com --- include/linux/pci.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 0e8e3fd77e96..d26e6611bd00 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1139,9 +1139,6 @@ resource_size_t pcibios_align_resource(void *, const struct resource *, resource_size_t, resource_size_t); -/* Weak but can be overridden by arch */ -void pci_fixup_cardbus(struct pci_bus *); - /* Generic PCI functions used internally */ void pcibios_resource_to_bus(struct pci_bus *bus, struct pci_bus_region *region, -- cgit v1.2.3 From b1e904999542ad6764eafa54545f1c55776006d1 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 8 Apr 2025 11:32:01 -0700 Subject: net: pass const to msg_data_left() The msg_data_left() function doesn't modify the struct msghdr parameter, so mark it as const. This allows the function to be used with const references, improving type safety and making the API more flexible. Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250408-tcpsendmsg-v3-1-208b87064c28@debian.org Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index c3322eb3d686..3b262487ec06 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -168,7 +168,7 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); } -static inline size_t msg_data_left(struct msghdr *msg) +static inline size_t msg_data_left(const struct msghdr *msg) { return iov_iter_count(&msg->msg_iter); } -- cgit v1.2.3 From de66754e9f8029f8ae955a588959b99cab56b506 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Wed, 9 Apr 2025 12:47:34 -0700 Subject: xhci: sideband: add initial api to register a secondary interrupter entity Introduce XHCI sideband, which manages the USB endpoints being requested by a client driver. This is used for when client drivers are attempting to offload USB endpoints to another entity for handling USB transfers. XHCI sec intr will allow for drivers to fetch the required information about the transfer ring, so the user can submit transfers independently. Expose the required APIs for drivers to register and request for a USB endpoint and to manage XHCI secondary interrupters. Driver renaming, multiple ring segment page linking, proper endpoint clean up, and allowing module compilation added by Wesley Cheng to complete original concept code by Mathias Nyman. Tested-by: Puma Hsu Tested-by: Daehwan Jung Signed-off-by: Mathias Nyman Co-developed-by: Wesley Cheng Signed-off-by: Wesley Cheng Acked-by: Mark Brown Link: https://lore.kernel.org/r/20250409194804.3773260-2-quic_wcheng@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/xhci-sideband.h | 74 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 include/linux/usb/xhci-sideband.h (limited to 'include/linux') diff --git a/include/linux/usb/xhci-sideband.h b/include/linux/usb/xhci-sideband.h new file mode 100644 index 000000000000..4b382af892fa --- /dev/null +++ b/include/linux/usb/xhci-sideband.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * xHCI host controller sideband support + * + * Copyright (c) 2023-2025, Intel Corporation. + * + * Author: Mathias Nyman + */ +#ifndef __LINUX_XHCI_SIDEBAND_H +#define __LINUX_XHCI_SIDEBAND_H + +#include +#include + +#define EP_CTX_PER_DEV 31 /* FIXME defined twice, from xhci.h */ + +struct xhci_sideband; + +enum xhci_sideband_type { + XHCI_SIDEBAND_AUDIO, + XHCI_SIDEBAND_VENDOR, +}; + +/** + * struct xhci_sideband - representation of a sideband accessed usb device. + * @xhci: The xhci host controller the usb device is connected to + * @vdev: the usb device accessed via sideband + * @eps: array of endpoints controlled via sideband + * @ir: event handling and buffer for sideband accessed device + * @type: xHCI sideband type + * @mutex: mutex for sideband operations + * @intf: USB sideband client interface + * + * FIXME usb device accessed via sideband Keeping track of sideband accessed usb devices. + */ +struct xhci_sideband { + struct xhci_hcd *xhci; + struct xhci_virt_device *vdev; + struct xhci_virt_ep *eps[EP_CTX_PER_DEV]; + struct xhci_interrupter *ir; + enum xhci_sideband_type type; + + /* Synchronizing xHCI sideband operations with client drivers operations */ + struct mutex mutex; + + struct usb_interface *intf; +}; + +struct xhci_sideband * +xhci_sideband_register(struct usb_interface *intf, enum xhci_sideband_type type); +void +xhci_sideband_unregister(struct xhci_sideband *sb); +int +xhci_sideband_add_endpoint(struct xhci_sideband *sb, + struct usb_host_endpoint *host_ep); +int +xhci_sideband_remove_endpoint(struct xhci_sideband *sb, + struct usb_host_endpoint *host_ep); +int +xhci_sideband_stop_endpoint(struct xhci_sideband *sb, + struct usb_host_endpoint *host_ep); +struct sg_table * +xhci_sideband_get_endpoint_buffer(struct xhci_sideband *sb, + struct usb_host_endpoint *host_ep); +struct sg_table * +xhci_sideband_get_event_buffer(struct xhci_sideband *sb); +int +xhci_sideband_create_interrupter(struct xhci_sideband *sb, int num_seg, + bool ip_autoclear, u32 imod_interval); +void +xhci_sideband_remove_interrupter(struct xhci_sideband *sb); +int +xhci_sideband_interrupter_id(struct xhci_sideband *sb); +#endif /* __LINUX_XHCI_SIDEBAND_H */ -- cgit v1.2.3 From fce57295497df70711d50ed01bf6d914de0ca647 Mon Sep 17 00:00:00 2001 From: Wesley Cheng Date: Wed, 9 Apr 2025 12:47:36 -0700 Subject: usb: host: xhci-mem: Allow for interrupter clients to choose specific index Some clients may operate only on a specific XHCI interrupter instance. Allow for the associated class driver to request for the interrupter that it requires. Tested-by: Puma Hsu Tested-by: Daehwan Jung Signed-off-by: Wesley Cheng Acked-by: Mark Brown Link: https://lore.kernel.org/r/20250409194804.3773260-4-quic_wcheng@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/xhci-sideband.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/xhci-sideband.h b/include/linux/usb/xhci-sideband.h index 4b382af892fa..f8722afb8a2d 100644 --- a/include/linux/usb/xhci-sideband.h +++ b/include/linux/usb/xhci-sideband.h @@ -66,7 +66,7 @@ struct sg_table * xhci_sideband_get_event_buffer(struct xhci_sideband *sb); int xhci_sideband_create_interrupter(struct xhci_sideband *sb, int num_seg, - bool ip_autoclear, u32 imod_interval); + bool ip_autoclear, u32 imod_interval, int intr_num); void xhci_sideband_remove_interrupter(struct xhci_sideband *sb); int -- cgit v1.2.3 From b85a2ebda1034881d09e6127726dc78950669474 Mon Sep 17 00:00:00 2001 From: Wesley Cheng Date: Wed, 9 Apr 2025 12:47:38 -0700 Subject: usb: host: xhci: Notify xHCI sideband on transfer ring free In the case of handling a USB bus reset, the xhci_discover_or_reset_device can run without first notifying the xHCI sideband client driver to stop or prevent the use of the transfer ring. It was seen that when a bus reset situation happened, the USB offload driver was attempting to fetch the xHCI transfer ring information, which was already freed. Tested-by: Puma Hsu Tested-by: Daehwan Jung Signed-off-by: Wesley Cheng Acked-by: Mark Brown Link: https://lore.kernel.org/r/20250409194804.3773260-6-quic_wcheng@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/xhci-sideband.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/xhci-sideband.h b/include/linux/usb/xhci-sideband.h index f8722afb8a2d..45288c392f6e 100644 --- a/include/linux/usb/xhci-sideband.h +++ b/include/linux/usb/xhci-sideband.h @@ -21,6 +21,20 @@ enum xhci_sideband_type { XHCI_SIDEBAND_VENDOR, }; +enum xhci_sideband_notify_type { + XHCI_SIDEBAND_XFER_RING_FREE, +}; + +/** + * struct xhci_sideband_event - sideband event + * @type: notifier type + * @evt_data: event data + */ +struct xhci_sideband_event { + enum xhci_sideband_notify_type type; + void *evt_data; +}; + /** * struct xhci_sideband - representation of a sideband accessed usb device. * @xhci: The xhci host controller the usb device is connected to @@ -30,6 +44,7 @@ enum xhci_sideband_type { * @type: xHCI sideband type * @mutex: mutex for sideband operations * @intf: USB sideband client interface + * @notify_client: callback for xHCI sideband sequences * * FIXME usb device accessed via sideband Keeping track of sideband accessed usb devices. */ @@ -44,10 +59,14 @@ struct xhci_sideband { struct mutex mutex; struct usb_interface *intf; + int (*notify_client)(struct usb_interface *intf, + struct xhci_sideband_event *evt); }; struct xhci_sideband * -xhci_sideband_register(struct usb_interface *intf, enum xhci_sideband_type type); +xhci_sideband_register(struct usb_interface *intf, enum xhci_sideband_type type, + int (*notify_client)(struct usb_interface *intf, + struct xhci_sideband_event *evt)); void xhci_sideband_unregister(struct xhci_sideband *sb); int @@ -71,4 +90,13 @@ void xhci_sideband_remove_interrupter(struct xhci_sideband *sb); int xhci_sideband_interrupter_id(struct xhci_sideband *sb); + +#if IS_ENABLED(CONFIG_USB_XHCI_SIDEBAND) +void xhci_sideband_notify_ep_ring_free(struct xhci_sideband *sb, + unsigned int ep_index); +#else +static inline void xhci_sideband_notify_ep_ring_free(struct xhci_sideband *sb, + unsigned int ep_index) +{ } +#endif /* IS_ENABLED(CONFIG_USB_XHCI_SIDEBAND) */ #endif /* __LINUX_XHCI_SIDEBAND_H */ -- cgit v1.2.3 From 67890d579402804b1d32b3280d9860073542528e Mon Sep 17 00:00:00 2001 From: Wesley Cheng Date: Wed, 9 Apr 2025 12:47:40 -0700 Subject: ALSA: Add USB audio device jack type Add an USB jack type, in order to support notifying of a valid USB audio device. Since USB audio devices can have a slew of different configurations that reach beyond the basic headset and headphone use cases, classify these devices differently. Reviewed-by: Pierre-Louis Bossart Signed-off-by: Wesley Cheng Acked-by: Mark Brown Link: https://lore.kernel.org/r/20250409194804.3773260-8-quic_wcheng@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/mod_devicetable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index bd7e60c0b72f..dde836875deb 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -340,7 +340,7 @@ struct pcmcia_device_id { #define INPUT_DEVICE_ID_LED_MAX 0x0f #define INPUT_DEVICE_ID_SND_MAX 0x07 #define INPUT_DEVICE_ID_FF_MAX 0x7f -#define INPUT_DEVICE_ID_SW_MAX 0x10 +#define INPUT_DEVICE_ID_SW_MAX 0x11 #define INPUT_DEVICE_ID_PROP_MAX 0x1f #define INPUT_DEVICE_ID_MATCH_BUS 1 -- cgit v1.2.3 From 10ed34d6eaaf86e301a8f2dd190d26dfbc9799bd Mon Sep 17 00:00:00 2001 From: Sandor Yu Date: Tue, 18 Mar 2025 14:35:35 +0200 Subject: phy: Add HDMI configuration options Allow HDMI PHYs to be configured through the generic functions through a custom structure added to the generic union. The parameters added here are based on HDMI PHY implementation practices. The current set of parameters should cover the potential users. Signed-off-by: Sandor Yu Reviewed-by: Dmitry Baryshkov Reviewed-by: Maxime Ripard Acked-by: Vinod Koul Link: https://lore.kernel.org/r/d1cff6c03ec3732d2244022029245ab2d954d997.1734340233.git.Sandor.yu@nxp.com Signed-off-by: Cristian Ciocaltea Link: https://lore.kernel.org/r/20250318-phy-sam-hdptx-bpc-v6-1-8cb1678e7663@collabora.com Signed-off-by: Vinod Koul --- include/linux/phy/phy-hdmi.h | 19 +++++++++++++++++++ include/linux/phy/phy.h | 7 ++++++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 include/linux/phy/phy-hdmi.h (limited to 'include/linux') diff --git a/include/linux/phy/phy-hdmi.h b/include/linux/phy/phy-hdmi.h new file mode 100644 index 000000000000..6a696922bc7f --- /dev/null +++ b/include/linux/phy/phy-hdmi.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2022,2024 NXP + */ + +#ifndef __PHY_HDMI_H_ +#define __PHY_HDMI_H_ + +/** + * struct phy_configure_opts_hdmi - HDMI configuration set + * @tmds_char_rate: HDMI TMDS Character Rate in Hertz. + * + * This structure is used to represent the configuration state of a HDMI phy. + */ +struct phy_configure_opts_hdmi { + unsigned long long tmds_char_rate; +}; + +#endif /* __PHY_HDMI_H_ */ diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index e63e6e70e860..437769e061b7 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -42,7 +43,8 @@ enum phy_mode { PHY_MODE_MIPI_DPHY, PHY_MODE_SATA, PHY_MODE_LVDS, - PHY_MODE_DP + PHY_MODE_DP, + PHY_MODE_HDMI, }; enum phy_media { @@ -60,11 +62,14 @@ enum phy_media { * the DisplayPort protocol. * @lvds: Configuration set applicable for phys supporting * the LVDS phy mode. + * @hdmi: Configuration set applicable for phys supporting + * the HDMI phy mode. */ union phy_configure_opts { struct phy_configure_opts_mipi_dphy mipi_dphy; struct phy_configure_opts_dp dp; struct phy_configure_opts_lvds lvds; + struct phy_configure_opts_hdmi hdmi; }; /** -- cgit v1.2.3 From 3bb9286f4ece6acbc1fbaa9f192a82645d30efbf Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Tue, 18 Mar 2025 14:35:36 +0200 Subject: phy: hdmi: Add color depth configuration Extend the HDMI configuration options to allow managing bits per color channel. This is required by some PHY drivers such as rockchip-samsung-hdptx. Reviewed-by: Dmitry Baryshkov Signed-off-by: Cristian Ciocaltea Link: https://lore.kernel.org/r/20250318-phy-sam-hdptx-bpc-v6-2-8cb1678e7663@collabora.com Signed-off-by: Vinod Koul --- include/linux/phy/phy-hdmi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy/phy-hdmi.h b/include/linux/phy/phy-hdmi.h index 6a696922bc7f..f0ec963c6e84 100644 --- a/include/linux/phy/phy-hdmi.h +++ b/include/linux/phy/phy-hdmi.h @@ -9,11 +9,13 @@ /** * struct phy_configure_opts_hdmi - HDMI configuration set * @tmds_char_rate: HDMI TMDS Character Rate in Hertz. + * @bpc: Bits per color channel. * * This structure is used to represent the configuration state of a HDMI phy. */ struct phy_configure_opts_hdmi { unsigned long long tmds_char_rate; + unsigned int bpc; }; #endif /* __PHY_HDMI_H_ */ -- cgit v1.2.3 From 1692632146451184c4bcb68554098470a119fb01 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 8 Apr 2025 20:08:51 +0800 Subject: USB: core: Correct API usb_(enable|disable)_autosuspend() prototypes API usb_(enable|disable)_autosuspend() have inconsistent prototypes regarding if CONFIG_PM is defined. Correct prototypes when the macro is undefined by referring to those when the macro is defined. Signed-off-by: Zijun Hu Reviewed-by: Alan Stern Link: https://lore.kernel.org/r/20250408-fix_usb_hdr-v1-1-e785c5b49481@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index b46738701f8d..1b2545b4363b 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -815,10 +815,10 @@ static inline void usb_mark_last_busy(struct usb_device *udev) #else -static inline int usb_enable_autosuspend(struct usb_device *udev) -{ return 0; } -static inline int usb_disable_autosuspend(struct usb_device *udev) -{ return 0; } +static inline void usb_enable_autosuspend(struct usb_device *udev) +{ } +static inline void usb_disable_autosuspend(struct usb_device *udev) +{ } static inline int usb_autopm_get_interface(struct usb_interface *intf) { return 0; } -- cgit v1.2.3 From 2acaf27cd7f4f32bfe8bf7335690618e2417e744 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 9 Apr 2025 21:13:54 -0400 Subject: vt: move unicode processing to a separate file This will make it easier to maintain. Also make it depend on CONFIG_CONSOLE_TRANSLATIONS. Signed-off-by: Nicolas Pitre Link: https://lore.kernel.org/r/20250410011839.64418-3-nico@fluxnic.net Signed-off-by: Greg Kroah-Hartman --- include/linux/consolemap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h index c35db4896c37..caf079bcb8c9 100644 --- a/include/linux/consolemap.h +++ b/include/linux/consolemap.h @@ -28,6 +28,7 @@ int conv_uni_to_pc(struct vc_data *conp, long ucs); u32 conv_8bit_to_uni(unsigned char c); int conv_uni_to_8bit(u32 uni); void console_map_init(void); +bool ucs_is_double_width(uint32_t cp); #else static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode) @@ -57,6 +58,11 @@ static inline int conv_uni_to_8bit(u32 uni) } static inline void console_map_init(void) { } + +static inline bool ucs_is_double_width(uint32_t cp) +{ + return false; +} #endif /* CONFIG_CONSOLE_TRANSLATIONS */ #endif /* __LINUX_CONSOLEMAP_H__ */ -- cgit v1.2.3 From e88391f730e46d208b7fb37b02611d24137af1ef Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 9 Apr 2025 21:13:55 -0400 Subject: vt: properly support zero-width Unicode code points Zero-width Unicode code points are causing misalignment in vertically aligned content, disrupting the visual layout. Let's handle zero-width code points more intelligently. Double-width code points are stored in the screen grid followed by a white space code point to create the expected screen layout. When a double-width code point is followed by a zero-width code point in the console incoming bytestream (e.g., an emoji with a presentation selector) then we may replace the white space padding by that zero-width code point instead of dropping it. This maximize screen content information while preserving proper layout. If a zero-width code point is preceded by a single-width code point then the above trick is not possible and such zero-width code point must be dropped. VS16 (Variation Selector 16, U+FE0F) is special as it doubles the width of the preceding single-width code point. We handle that case by giving VS16 a width of 1 when that happens. Signed-off-by: Nicolas Pitre Link: https://lore.kernel.org/r/20250410011839.64418-4-nico@fluxnic.net Signed-off-by: Greg Kroah-Hartman --- include/linux/consolemap.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h index caf079bcb8c9..7d778752dcef 100644 --- a/include/linux/consolemap.h +++ b/include/linux/consolemap.h @@ -29,6 +29,11 @@ u32 conv_8bit_to_uni(unsigned char c); int conv_uni_to_8bit(u32 uni); void console_map_init(void); bool ucs_is_double_width(uint32_t cp); +static inline bool ucs_is_zero_width(uint32_t cp) +{ + /* coming soon */ + return false; +} #else static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode) @@ -63,6 +68,11 @@ static inline bool ucs_is_double_width(uint32_t cp) { return false; } + +static inline bool ucs_is_zero_width(uint32_t cp) +{ + return false; +} #endif /* CONFIG_CONSOLE_TRANSLATIONS */ #endif /* __LINUX_CONSOLEMAP_H__ */ -- cgit v1.2.3 From 3a1ab63aa05b4736a7d30ae0a769385662f13def Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 9 Apr 2025 21:13:57 -0400 Subject: vt: update ucs_width.c using gen_ucs_width.py This replaces ucs_width.c with the code generated by gen_ucs_width.py providing comprehensive tables for double-width and zero-width Unicode code points. Also make ucs_is_zero_width() effective. Note: scripts/checkpatch.pl complains about "... exceeds 100 columns". Please ignore. Signed-off-by: Nicolas Pitre Link: https://lore.kernel.org/r/20250410011839.64418-6-nico@fluxnic.net Signed-off-by: Greg Kroah-Hartman --- include/linux/consolemap.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h index 7d778752dcef..b3a911866662 100644 --- a/include/linux/consolemap.h +++ b/include/linux/consolemap.h @@ -29,11 +29,7 @@ u32 conv_8bit_to_uni(unsigned char c); int conv_uni_to_8bit(u32 uni); void console_map_init(void); bool ucs_is_double_width(uint32_t cp); -static inline bool ucs_is_zero_width(uint32_t cp) -{ - /* coming soon */ - return false; -} +bool ucs_is_zero_width(uint32_t cp); #else static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode) -- cgit v1.2.3 From 54af55b990eda5a6a0140a3cded8094b42c0c3b7 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 9 Apr 2025 21:13:59 -0400 Subject: vt: create ucs_recompose.c using gen_ucs_recompose.py This provides ucs_recompose() to recompose two Unicode characters into a single character if possible. This is needed for the VT to properly display decomposed UTF8 sequences. Note: scripts/checkpatch.pl complains about "... exceeds 100 columns". Please ignore. Signed-off-by: Nicolas Pitre Link: https://lore.kernel.org/r/20250410011839.64418-8-nico@fluxnic.net Signed-off-by: Greg Kroah-Hartman --- include/linux/consolemap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h index b3a911866662..4d3a34c288e5 100644 --- a/include/linux/consolemap.h +++ b/include/linux/consolemap.h @@ -30,6 +30,7 @@ int conv_uni_to_8bit(u32 uni); void console_map_init(void); bool ucs_is_double_width(uint32_t cp); bool ucs_is_zero_width(uint32_t cp); +uint32_t ucs_recompose(uint32_t base, uint32_t combining); #else static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode) @@ -69,6 +70,11 @@ static inline bool ucs_is_zero_width(uint32_t cp) { return false; } + +static inline uint32_t ucs_recompose(uint32_t base, uint32_t combining) +{ + return 0; +} #endif /* CONFIG_CONSOLE_TRANSLATIONS */ #endif /* __LINUX_CONSOLEMAP_H__ */ -- cgit v1.2.3 From 2a63dd0edf388802074f1d4d6b588a3b4c380688 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 9 Apr 2025 19:36:45 -0700 Subject: net: Retire DCCP socket. DCCP was orphaned in 2021 by commit 054c4610bd05 ("MAINTAINERS: dccp: move Gerrit Renker to CREDITS"), which noted that the last maintainer had been inactive for five years. In recent years, it has become a playground for syzbot, and most changes to DCCP have been odd bug fixes triggered by syzbot. Apart from that, the only changes have been driven by treewide or networking API updates or adjustments related to TCP. Thus, in 2023, we announced we would remove DCCP in 2025 via commit b144fcaf46d4 ("dccp: Print deprecation notice."). Since then, only one individual has contacted the netdev mailing list. [0] There is ongoing research for Multipath DCCP. The repository is hosted on GitHub [1], and development is not taking place through the upstream community. While the repository is published under the GPLv2 license, the scheduling part remains proprietary, with a LICENSE file [2] stating: "This is not Open Source software." The researcher mentioned a plan to address the licensing issue, upstream the patches, and step up as a maintainer, but there has been no further communication since then. Maintaining DCCP for a decade without any real users has become a burden. Therefore, it's time to remove it. Removing DCCP will also provide significant benefits to TCP. It allows us to freely reorganize the layout of struct inet_connection_sock, which is currently shared with DCCP, and optimize it to reduce the number of cachelines accessed in the TCP fast path. Note that we keep DCCP netfilter modules as requested. [3] Link: https://lore.kernel.org/netdev/20230710182253.81446-1-kuniyu@amazon.com/T/#u #[0] Link: https://github.com/telekom/mp-dccp #[1] Link: https://github.com/telekom/mp-dccp/blob/mpdccp_v03_k5.10/net/dccp/non_gpl_scheduler/LICENSE #[2] Link: https://lore.kernel.org/netdev/Z_VQ0KlCRkqYWXa-@calendula/ #[3] Signed-off-by: Kuniyuki Iwashima Acked-by: Paul Moore (LSM and SELinux) Acked-by: Casey Schaufler Link: https://patch.msgid.link/20250410023921.11307-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/linux/dccp.h | 289 --------------------------------------------------- include/linux/tfrc.h | 51 --------- 2 files changed, 340 deletions(-) delete mode 100644 include/linux/tfrc.h (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 325af611909f..0b61b8b996d4 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -2,79 +2,8 @@ #ifndef _LINUX_DCCP_H #define _LINUX_DCCP_H - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include #include -enum dccp_state { - DCCP_OPEN = TCP_ESTABLISHED, - DCCP_REQUESTING = TCP_SYN_SENT, - DCCP_LISTEN = TCP_LISTEN, - DCCP_RESPOND = TCP_SYN_RECV, - /* - * States involved in closing a DCCP connection: - * 1) ACTIVE_CLOSEREQ is entered by a server sending a CloseReq. - * - * 2) CLOSING can have three different meanings (RFC 4340, 8.3): - * a. Client has performed active-close, has sent a Close to the server - * from state OPEN or PARTOPEN, and is waiting for the final Reset - * (in this case, SOCK_DONE == 1). - * b. Client is asked to perform passive-close, by receiving a CloseReq - * in (PART)OPEN state. It sends a Close and waits for final Reset - * (in this case, SOCK_DONE == 0). - * c. Server performs an active-close as in (a), keeps TIMEWAIT state. - * - * 3) The following intermediate states are employed to give passively - * closing nodes a chance to process their unread data: - * - PASSIVE_CLOSE (from OPEN => CLOSED) and - * - PASSIVE_CLOSEREQ (from (PART)OPEN to CLOSING; case (b) above). - */ - DCCP_ACTIVE_CLOSEREQ = TCP_FIN_WAIT1, - DCCP_PASSIVE_CLOSE = TCP_CLOSE_WAIT, /* any node receiving a Close */ - DCCP_CLOSING = TCP_CLOSING, - DCCP_TIME_WAIT = TCP_TIME_WAIT, - DCCP_CLOSED = TCP_CLOSE, - DCCP_NEW_SYN_RECV = TCP_NEW_SYN_RECV, - DCCP_PARTOPEN = TCP_MAX_STATES, - DCCP_PASSIVE_CLOSEREQ, /* clients receiving CloseReq */ - DCCP_MAX_STATES -}; - -enum { - DCCPF_OPEN = TCPF_ESTABLISHED, - DCCPF_REQUESTING = TCPF_SYN_SENT, - DCCPF_LISTEN = TCPF_LISTEN, - DCCPF_RESPOND = TCPF_SYN_RECV, - DCCPF_ACTIVE_CLOSEREQ = TCPF_FIN_WAIT1, - DCCPF_CLOSING = TCPF_CLOSING, - DCCPF_TIME_WAIT = TCPF_TIME_WAIT, - DCCPF_CLOSED = TCPF_CLOSE, - DCCPF_NEW_SYN_RECV = TCPF_NEW_SYN_RECV, - DCCPF_PARTOPEN = (1 << DCCP_PARTOPEN), -}; - -static inline struct dccp_hdr *dccp_hdr(const struct sk_buff *skb) -{ - return (struct dccp_hdr *)skb_transport_header(skb); -} - -static inline struct dccp_hdr *dccp_zeroed_hdr(struct sk_buff *skb, int headlen) -{ - skb_push(skb, headlen); - skb_reset_transport_header(skb); - return memset(skb_transport_header(skb), 0, headlen); -} - static inline struct dccp_hdr_ext *dccp_hdrx(const struct dccp_hdr *dh) { return (struct dccp_hdr_ext *)((unsigned char *)dh + sizeof(*dh)); @@ -85,12 +14,6 @@ static inline unsigned int __dccp_basic_hdr_len(const struct dccp_hdr *dh) return sizeof(*dh) + (dh->dccph_x ? sizeof(struct dccp_hdr_ext) : 0); } -static inline unsigned int dccp_basic_hdr_len(const struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - return __dccp_basic_hdr_len(dh); -} - static inline __u64 dccp_hdr_seq(const struct dccp_hdr *dh) { __u64 seq_nr = ntohs(dh->dccph_seq); @@ -103,222 +26,10 @@ static inline __u64 dccp_hdr_seq(const struct dccp_hdr *dh) return seq_nr; } -static inline struct dccp_hdr_request *dccp_hdr_request(struct sk_buff *skb) -{ - return (struct dccp_hdr_request *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - -static inline struct dccp_hdr_ack_bits *dccp_hdr_ack_bits(const struct sk_buff *skb) -{ - return (struct dccp_hdr_ack_bits *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - -static inline u64 dccp_hdr_ack_seq(const struct sk_buff *skb) -{ - const struct dccp_hdr_ack_bits *dhack = dccp_hdr_ack_bits(skb); - return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + ntohl(dhack->dccph_ack_nr_low); -} - -static inline struct dccp_hdr_response *dccp_hdr_response(struct sk_buff *skb) -{ - return (struct dccp_hdr_response *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - -static inline struct dccp_hdr_reset *dccp_hdr_reset(struct sk_buff *skb) -{ - return (struct dccp_hdr_reset *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - static inline unsigned int __dccp_hdr_len(const struct dccp_hdr *dh) { return __dccp_basic_hdr_len(dh) + dccp_packet_hdr_len(dh->dccph_type); } -static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) -{ - return __dccp_hdr_len(dccp_hdr(skb)); -} - -/** - * struct dccp_request_sock - represent DCCP-specific connection request - * @dreq_inet_rsk: structure inherited from - * @dreq_iss: initial sequence number, sent on the first Response (RFC 4340, 7.1) - * @dreq_gss: greatest sequence number sent (for retransmitted Responses) - * @dreq_isr: initial sequence number received in the first Request - * @dreq_gsr: greatest sequence number received (for retransmitted Request(s)) - * @dreq_service: service code present on the Request (there is just one) - * @dreq_featneg: feature negotiation options for this connection - * The following two fields are analogous to the ones in dccp_sock: - * @dreq_timestamp_echo: last received timestamp to echo (13.1) - * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo - */ -struct dccp_request_sock { - struct inet_request_sock dreq_inet_rsk; - __u64 dreq_iss; - __u64 dreq_gss; - __u64 dreq_isr; - __u64 dreq_gsr; - __be32 dreq_service; - spinlock_t dreq_lock; - struct list_head dreq_featneg; - __u32 dreq_timestamp_echo; - __u32 dreq_timestamp_time; -}; - -static inline struct dccp_request_sock *dccp_rsk(const struct request_sock *req) -{ - return (struct dccp_request_sock *)req; -} - -extern struct inet_timewait_death_row dccp_death_row; - -extern int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - struct sk_buff *skb); - -struct dccp_options_received { - u64 dccpor_ndp:48; - u32 dccpor_timestamp; - u32 dccpor_timestamp_echo; - u32 dccpor_elapsed_time; -}; - -struct ccid; - -enum dccp_role { - DCCP_ROLE_UNDEFINED, - DCCP_ROLE_LISTEN, - DCCP_ROLE_CLIENT, - DCCP_ROLE_SERVER, -}; - -struct dccp_service_list { - __u32 dccpsl_nr; - __be32 dccpsl_list[]; -}; - -#define DCCP_SERVICE_INVALID_VALUE htonl((__u32)-1) -#define DCCP_SERVICE_CODE_IS_ABSENT 0 - -static inline bool dccp_list_has_service(const struct dccp_service_list *sl, - const __be32 service) -{ - if (likely(sl != NULL)) { - u32 i = sl->dccpsl_nr; - while (i--) - if (sl->dccpsl_list[i] == service) - return true; - } - return false; -} - -struct dccp_ackvec; - -/** - * struct dccp_sock - DCCP socket state - * - * @dccps_swl - sequence number window low - * @dccps_swh - sequence number window high - * @dccps_awl - acknowledgement number window low - * @dccps_awh - acknowledgement number window high - * @dccps_iss - initial sequence number sent - * @dccps_isr - initial sequence number received - * @dccps_osr - first OPEN sequence number received - * @dccps_gss - greatest sequence number sent - * @dccps_gsr - greatest valid sequence number received - * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss - * @dccps_service - first (passive sock) or unique (active sock) service code - * @dccps_service_list - second .. last service code on passive socket - * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option - * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo - * @dccps_l_ack_ratio - feature-local Ack Ratio - * @dccps_r_ack_ratio - feature-remote Ack Ratio - * @dccps_l_seq_win - local Sequence Window (influences ack number validity) - * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) - * @dccps_pcslen - sender partial checksum coverage (via sockopt) - * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) - * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) - * @dccps_ndp_count - number of Non Data Packets since last data packet - * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) - * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) - * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) - * @dccps_hc_rx_ackvec - rx half connection ack vector - * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) - * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) - * @dccps_options_received - parsed set of retrieved options - * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy - * @dccps_tx_qlen - maximum length of the TX queue - * @dccps_role - role of this sock, one of %dccp_role - * @dccps_hc_rx_insert_options - receiver wants to add options when acking - * @dccps_hc_tx_insert_options - sender wants to add options when sending - * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) - * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" - * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets - * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing) - * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) - */ -struct dccp_sock { - /* inet_connection_sock has to be the first member of dccp_sock */ - struct inet_connection_sock dccps_inet_connection; -#define dccps_syn_rtt dccps_inet_connection.icsk_ack.lrcvtime - __u64 dccps_swl; - __u64 dccps_swh; - __u64 dccps_awl; - __u64 dccps_awh; - __u64 dccps_iss; - __u64 dccps_isr; - __u64 dccps_osr; - __u64 dccps_gss; - __u64 dccps_gsr; - __u64 dccps_gar; - __be32 dccps_service; - __u32 dccps_mss_cache; - struct dccp_service_list *dccps_service_list; - __u32 dccps_timestamp_echo; - __u32 dccps_timestamp_time; - __u16 dccps_l_ack_ratio; - __u16 dccps_r_ack_ratio; - __u64 dccps_l_seq_win:48; - __u64 dccps_r_seq_win:48; - __u8 dccps_pcslen:4; - __u8 dccps_pcrlen:4; - __u8 dccps_send_ndp_count:1; - __u64 dccps_ndp_count:48; - unsigned long dccps_rate_last; - struct list_head dccps_featneg; - struct dccp_ackvec *dccps_hc_rx_ackvec; - struct ccid *dccps_hc_rx_ccid; - struct ccid *dccps_hc_tx_ccid; - struct dccp_options_received dccps_options_received; - __u8 dccps_qpolicy; - __u32 dccps_tx_qlen; - enum dccp_role dccps_role:2; - __u8 dccps_hc_rx_insert_options:1; - __u8 dccps_hc_tx_insert_options:1; - __u8 dccps_server_timewait:1; - __u8 dccps_sync_scheduled:1; - struct tasklet_struct dccps_xmitlet; - struct timer_list dccps_xmit_timer; -}; - -#define dccp_sk(ptr) container_of_const(ptr, struct dccp_sock, \ - dccps_inet_connection.icsk_inet.sk) - -static inline const char *dccp_role(const struct sock *sk) -{ - switch (dccp_sk(sk)->dccps_role) { - case DCCP_ROLE_UNDEFINED: return "undefined"; - case DCCP_ROLE_LISTEN: return "listen"; - case DCCP_ROLE_SERVER: return "server"; - case DCCP_ROLE_CLIENT: return "client"; - } - return NULL; -} - -extern void dccp_syn_ack_timeout(const struct request_sock *req); - #endif /* _LINUX_DCCP_H */ diff --git a/include/linux/tfrc.h b/include/linux/tfrc.h deleted file mode 100644 index a5acc768085d..000000000000 --- a/include/linux/tfrc.h +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _LINUX_TFRC_H_ -#define _LINUX_TFRC_H_ -/* - * TFRC - Data Structures for the TCP-Friendly Rate Control congestion - * control mechanism as specified in RFC 3448. - * - * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - */ -#include - -/** tfrc_rx_info - TFRC Receiver Data Structure - * - * @tfrcrx_x_recv: receiver estimate of sending rate (3.2.2) - * @tfrcrx_rtt: round-trip-time (communicated by sender) - * @tfrcrx_p: current estimate of loss event rate (3.2.2) - */ -struct tfrc_rx_info { - __u32 tfrcrx_x_recv; - __u32 tfrcrx_rtt; - __u32 tfrcrx_p; -}; - -/** tfrc_tx_info - TFRC Sender Data Structure - * - * @tfrctx_x: computed transmit rate (4.3 (4)) - * @tfrctx_x_recv: receiver estimate of send rate (4.3) - * @tfrctx_x_calc: return value of throughput equation (3.1) - * @tfrctx_rtt: (moving average) estimate of RTT (4.3) - * @tfrctx_p: current loss event rate (5.4) - * @tfrctx_rto: estimate of RTO, equals 4*RTT (4.3) - * @tfrctx_ipi: inter-packet interval (4.6) - * - * Note: X and X_recv are both maintained in units of 64 * bytes/second. This - * enables a finer resolution of sending rates and avoids problems with - * integer arithmetic; u32 is not sufficient as scaling consumes 6 bits. - */ -struct tfrc_tx_info { - __u64 tfrctx_x; - __u64 tfrctx_x_recv; - __u32 tfrctx_x_calc; - __u32 tfrctx_rtt; - __u32 tfrctx_p; - __u32 tfrctx_rto; - __u32 tfrctx_ipi; -}; - -#endif /* _LINUX_TFRC_H_ */ -- cgit v1.2.3 From 88113e09ada52be28968aacf4af7b3d667832f00 Mon Sep 17 00:00:00 2001 From: Mukesh Kumar Savaliya Date: Fri, 4 Apr 2025 19:24:27 +0530 Subject: spi: Add support for Double Transfer Rate (DTR) mode Introduce support for protocol drivers to specify whether a transfer should use single or dual transfer mode. Currently, the SPI controller cannot determine this information from the user, leading to potential limitations in transfer capabilities. Add a new field `dtr_mode` in the `spi_transfer` structure. The `dtr_mode` field allows protocol drivers to indicate if Double Transfer Rate (DTR) mode is supported for a given transfer. When `dtr_mode` is set to true, the SPI controller will use DTR mode; otherwise, it will default to single transfer mode. Introduce another field `dtr_caps` to indicate if the QSPI controller is capable of supporting DTR mode (SDR and DDR). By default, both `dtr_caps` and `dtr_mode` will be false. These flags manage the QSPI controller's DTR mode capabilities within the SPI framework. The QSPI controller driver uses these flags to configure single or double transfer rates using the controller register. The existing spi-mem driver helps configure the DTR mode but is limited to memory devices. There is no support available to set DTR mode for non-memory devices, e.g., touch or any generic SPI sensor. This change is backward compatible and doesn't break existing SPI or QSPI drivers. Changes include: - Addition of `dtr_mode` and `dtr_caps` fields in the `spi_transfer` structure. - Documentation updates to reflect the new `dtr_mode` and `dtr_caps` fields. Signed-off-by: Mukesh Kumar Savaliya Link: https://patch.msgid.link/20250404135427.313825-1-quic_msavaliy@quicinc.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e10f0ebb8250..834a09bd8ccc 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -508,6 +508,8 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * found. * @put_offload: release the offload instance acquired by @get_offload. * @mem_caps: controller capabilities for the handling of memory operations. + * @dtr_caps: true if controller has dtr(single/dual transfer rate) capability. + * QSPI based controller should fill this based on controller's capability. * @unprepare_message: undo any work done by prepare_message(). * @target_abort: abort the ongoing transfer request on an SPI target controller * @cs_gpiods: Array of GPIO descriptors to use as chip select lines; one per CS @@ -751,6 +753,9 @@ struct spi_controller { const struct spi_controller_mem_ops *mem_ops; const struct spi_controller_mem_caps *mem_caps; + /* SPI or QSPI controller can set to true if supports SDR/DDR transfer rate */ + bool dtr_caps; + struct spi_offload *(*get_offload)(struct spi_device *spi, const struct spi_offload_config *config); void (*put_offload)(struct spi_offload *offload); @@ -1003,6 +1008,7 @@ struct spi_res { * processed the word, i.e. the "pre" timestamp should be taken before * transmitting the "pre" word, and the "post" timestamp after receiving * transmit confirmation from the controller for the "post" word. + * @dtr_mode: true if supports double transfer rate. * @timestamped: true if the transfer has been timestamped * @error: Error status logged by SPI controller driver. * @@ -1054,6 +1060,9 @@ struct spi_res { * two should both be set. User can set transfer mode with SPI_NBITS_SINGLE(1x) * SPI_NBITS_DUAL(2x) and SPI_NBITS_QUAD(4x) to support these three transfer. * + * User may also set dtr_mode to true to use dual transfer mode if desired. if + * not, default considered as single transfer mode. + * * The code that submits an spi_message (and its spi_transfers) * to the lower layers is responsible for managing its memory. * Zero-initialize every field you don't set up explicitly, to @@ -1088,6 +1097,7 @@ struct spi_transfer { unsigned tx_nbits:4; unsigned rx_nbits:4; unsigned timestamped:1; + bool dtr_mode; #define SPI_NBITS_SINGLE 0x01 /* 1-bit transfer */ #define SPI_NBITS_DUAL 0x02 /* 2-bit transfer */ #define SPI_NBITS_QUAD 0x04 /* 4-bit transfer */ -- cgit v1.2.3 From 7cfe1e208b86c7fd4b35a8a502a8b15604eec1e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 3 Apr 2025 17:11:32 +0200 Subject: pwm: Make chip parameter to pwmchip_get_drvdata() a const pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dev_get_drvdata()'s parameter is a const pointer, so the chip passed to pwmchip_get_drvdata() can be const, too. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20250403151134.266388-2-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 9ece4e5d3815..bf0469b2201d 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -369,7 +369,7 @@ static inline struct device *pwmchip_parent(const struct pwm_chip *chip) return chip->dev.parent; } -static inline void *pwmchip_get_drvdata(struct pwm_chip *chip) +static inline void *pwmchip_get_drvdata(const struct pwm_chip *chip) { return dev_get_drvdata(&chip->dev); } -- cgit v1.2.3 From cb7ca40a3882360ce87191793449d48df0b29184 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Apr 2025 23:11:22 +0200 Subject: x86/fpu: Make task_struct::thread constant size Turn thread.fpu into a pointer. Since most FPU code internals work by passing around the FPU pointer already, the code generation impact is small. This allows us to remove the old kludge of task_struct being variable size: struct task_struct { ... /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. */ randomized_struct_fields_end /* CPU-specific state of this task: */ struct thread_struct thread; /* * WARNING: on x86, 'thread_struct' contains a variable-sized * structure. It *MUST* be at the end of 'task_struct'. * * Do not put anything below here! */ }; ... which creates a number of problems, such as requiring thread_struct to be the last member of the struct - not allowing it to be struct-randomized, etc. But the primary motivation is to allow the decoupling of task_struct from hardware details ( in particular), and to eventually allow the per-task infrastructure: DECLARE_PER_TASK(type, name); ... per_task(current, name) = val; ... which requires task_struct to be a constant size struct. The fpu_thread_struct_whitelist() quirk to hardened usercopy can be removed, now that the FPU structure is not embedded in the task struct anymore, which reduces text footprint a bit. Fixed-by: Oleg Nesterov Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Chang S. Bae Cc: H. Peter Anvin Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250409211127.3544993-4-mingo@kernel.org --- include/linux/sched.h | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f96ac1982893..4ecc0c6b1cb0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1646,22 +1646,15 @@ struct task_struct { struct user_event_mm *user_event_mm; #endif - /* - * New fields for task_struct should be added above here, so that - * they are included in the randomized portion of task_struct. - */ - randomized_struct_fields_end - /* CPU-specific state of this task: */ struct thread_struct thread; /* - * WARNING: on x86, 'thread_struct' contains a variable-sized - * structure. It *MUST* be at the end of 'task_struct'. - * - * Do not put anything below here! + * New fields for task_struct should be added above here, so that + * they are included in the randomized portion of task_struct. */ -}; + randomized_struct_fields_end +} __attribute__ ((aligned (64))); #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) -- cgit v1.2.3 From 34180863e000f325e5d11a2fd4af300a0ae50f71 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Tue, 8 Apr 2025 16:44:27 +0800 Subject: firmware: arm_scmi: imx: Add i.MX95 LMM protocol Add Logical Machine Management(LMM) protocol which is intended for boot, shutdown, and reset of other logical machines (LM). It is usually used to allow one LM to manager another used as an offload or accelerator engine. Reviewed-by: Cristian Marussi Signed-off-by: Peng Fan Message-Id: <20250408-imx-lmm-cpu-v4-3-4c5f4a456e49@nxp.com> Signed-off-by: Sudeep Holla --- include/linux/scmi_imx_protocol.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scmi_imx_protocol.h b/include/linux/scmi_imx_protocol.h index 53b356a26414..2a96fc29cb6f 100644 --- a/include/linux/scmi_imx_protocol.h +++ b/include/linux/scmi_imx_protocol.h @@ -11,8 +11,10 @@ #include #include #include +#include #include +#define SCMI_PROTOCOL_IMX_LMM 0x80 #define SCMI_PROTOCOL_IMX_BBM 0x81 #define SCMI_PROTOCOL_IMX_MISC 0x84 @@ -57,4 +59,34 @@ struct scmi_imx_misc_proto_ops { int (*misc_ctrl_req_notify)(const struct scmi_protocol_handle *ph, u32 ctrl_id, u32 evt_id, u32 flags); }; + +/* See LMM_ATTRIBUTES in imx95.rst */ +#define LMM_ID_DISCOVER 0xFFFFFFFFU +#define LMM_MAX_NAME 16 + +enum scmi_imx_lmm_state { + LMM_STATE_LM_OFF, + LMM_STATE_LM_ON, + LMM_STATE_LM_SUSPEND, + LMM_STATE_LM_POWERED, +}; + +struct scmi_imx_lmm_info { + u32 lmid; + enum scmi_imx_lmm_state state; + u32 errstatus; + u8 name[LMM_MAX_NAME]; +}; + +struct scmi_imx_lmm_proto_ops { + int (*lmm_power_boot)(const struct scmi_protocol_handle *ph, u32 lmid, + bool boot); + int (*lmm_info)(const struct scmi_protocol_handle *ph, u32 lmid, + struct scmi_imx_lmm_info *info); + int (*lmm_reset_vector_set)(const struct scmi_protocol_handle *ph, + u32 lmid, u32 cpuid, u32 flags, u64 vector); + int (*lmm_shutdown)(const struct scmi_protocol_handle *ph, u32 lmid, + u32 flags); +}; + #endif -- cgit v1.2.3 From e68c305bc2f3cdc0b6fa21cd9579650c1457c070 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Tue, 8 Apr 2025 16:44:28 +0800 Subject: firmware: arm_scmi: imx: Add i.MX95 CPU Protocol This protocol allows an agent to start, stop a CPU or set reset vector. It is used to manage auxiliary CPUs in an LM (e.g. additional cores in an AP cluster). Signed-off-by: Peng Fan Message-Id: <20250408-imx-lmm-cpu-v4-4-4c5f4a456e49@nxp.com> Signed-off-by: Sudeep Holla --- include/linux/scmi_imx_protocol.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scmi_imx_protocol.h b/include/linux/scmi_imx_protocol.h index 2a96fc29cb6f..27bd372cbfb1 100644 --- a/include/linux/scmi_imx_protocol.h +++ b/include/linux/scmi_imx_protocol.h @@ -16,6 +16,7 @@ #define SCMI_PROTOCOL_IMX_LMM 0x80 #define SCMI_PROTOCOL_IMX_BBM 0x81 +#define SCMI_PROTOCOL_IMX_CPU 0x82 #define SCMI_PROTOCOL_IMX_MISC 0x84 #define SCMI_IMX_VENDOR "NXP" @@ -89,4 +90,13 @@ struct scmi_imx_lmm_proto_ops { u32 flags); }; +struct scmi_imx_cpu_proto_ops { + int (*cpu_reset_vector_set)(const struct scmi_protocol_handle *ph, + u32 cpuid, u64 vector, bool start, + bool boot, bool resume); + int (*cpu_start)(const struct scmi_protocol_handle *ph, u32 cpuid, + bool start); + int (*cpu_started)(const struct scmi_protocol_handle *ph, u32 cpuid, + bool *started); +}; #endif -- cgit v1.2.3 From 7242bbf418f0521c0e3dd7034b7b3cc4413d7d5a Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Tue, 8 Apr 2025 16:44:29 +0800 Subject: firmware: imx: Add i.MX95 SCMI LMM driver The i.MX95 System manager exports SCMI LMM protocol for linux to manage Logical Machines. The driver is to use the LMM Protocol interface to boot, shutdown a LM. Reviewed-by: Cristian Marussi Signed-off-by: Peng Fan Message-Id: <20250408-imx-lmm-cpu-v4-5-4c5f4a456e49@nxp.com> Signed-off-by: Sudeep Holla --- include/linux/firmware/imx/sm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h index 9b85a3f028d1..bc27b04afb2f 100644 --- a/include/linux/firmware/imx/sm.h +++ b/include/linux/firmware/imx/sm.h @@ -8,6 +8,7 @@ #include #include +#include #include #define SCMI_IMX_CTRL_PDM_CLK_SEL 0 /* AON PDM clock sel */ @@ -20,4 +21,17 @@ int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val); int scmi_imx_misc_ctrl_set(u32 id, u32 val); +enum scmi_imx_lmm_op { + SCMI_IMX_LMM_BOOT, + SCMI_IMX_LMM_POWER_ON, + SCMI_IMX_LMM_SHUTDOWN, +}; + +/* For shutdown pperation */ +#define SCMI_IMX_LMM_OP_FORCEFUL 0 +#define SCMI_IMX_LMM_OP_GRACEFUL BIT(0) + +int scmi_imx_lmm_operation(u32 lmid, enum scmi_imx_lmm_op op, u32 flags); +int scmi_imx_lmm_info(u32 lmid, struct scmi_imx_lmm_info *info); +int scmi_imx_lmm_reset_vector_set(u32 lmid, u32 cpuid, u32 flags, u64 vector); #endif -- cgit v1.2.3 From 1055faa5d6606cccc706e37956f6ff8fb7c22d55 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Tue, 8 Apr 2025 16:44:30 +0800 Subject: firmware: imx: Add i.MX95 SCMI CPU driver The i.MX95 System manager exports SCMI CPU protocol for linux to manage cpu cores. The driver is to use the cpu Protocol interface to start, stop a cpu cores (eg, M7). Reviewed-by: Cristian Marussi Signed-off-by: Peng Fan Message-Id: <20250408-imx-lmm-cpu-v4-6-4c5f4a456e49@nxp.com> Signed-off-by: Sudeep Holla --- include/linux/firmware/imx/sm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h index bc27b04afb2f..a8a17eeb7d90 100644 --- a/include/linux/firmware/imx/sm.h +++ b/include/linux/firmware/imx/sm.h @@ -21,6 +21,11 @@ int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val); int scmi_imx_misc_ctrl_set(u32 id, u32 val); +int scmi_imx_cpu_start(u32 cpuid, bool start); +int scmi_imx_cpu_started(u32 cpuid, bool *started); +int scmi_imx_cpu_reset_vector_set(u32 cpuid, u64 vector, bool start, bool boot, + bool resume); + enum scmi_imx_lmm_op { SCMI_IMX_LMM_BOOT, SCMI_IMX_LMM_POWER_ON, -- cgit v1.2.3 From 097f171f98289cf737437599c40b0d1e81266e9e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:42:46 -0700 Subject: net: convert dev->rtnl_link_state to a bool netdevice reg_state was split into two 16 bit enums back in 2010 in commit a2835763e130 ("rtnetlink: handle rtnl_link netlink notifications manually"). Since the split the fields have been moved apart, and last year we converted reg_state to a normal u8 in commit 4d42b37def70 ("net: convert dev->reg_state to u8"). rtnl_link_state being a 16 bitfield makes no sense. Convert it to a single bool, it seems very unlikely after 15 years that we'll need more values in it. We could drop dev->rtnl_link_ops from the conditions but feels like having it there more clearly points at the reason for this hack. Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250410014246.780885-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d8544f6a680c..e6036b82ef4c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1946,9 +1946,6 @@ enum netdev_reg_state { * * @reg_state: Register/unregister state machine * @dismantle: Device is going to be freed - * @rtnl_link_state: This enum represents the phases of creating - * a new link - * * @needs_free_netdev: Should unregister perform free_netdev? * @priv_destructor: Called from unregister * @npinfo: XXX: need comments on this one @@ -2363,11 +2360,8 @@ struct net_device { /** @moving_ns: device is changing netns, protected by @lock */ bool moving_ns; - - enum { - RTNL_LINK_INITIALIZED, - RTNL_LINK_INITIALIZING, - } rtnl_link_state:16; + /** @rtnl_link_initializing: Device being created, suppress events */ + bool rtnl_link_initializing; bool needs_free_netdev; void (*priv_destructor)(struct net_device *dev); -- cgit v1.2.3 From cd3c93167da0e760b5819246eae7a4ea30fd014b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 9 Apr 2025 12:41:36 +0200 Subject: page_pool: Move pp_magic check into helper functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we are about to stash some more information into the pp_magic field, let's move the magic signature checks into a pair of helper functions so it can be changed in one place. Reviewed-by: Mina Almasry Tested-by: Yonglong Liu Acked-by: Jesper Dangaard Brouer Reviewed-by: Ilias Apalodimas Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409-page-pool-track-dma-v9-1-6a9ef2e0cba8@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/mm.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b7f13f087954..56c47f4a38ca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4248,4 +4248,24 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #define VM_SEALED_SYSMAP VM_NONE #endif +/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is + * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for + * the head page of compound page and bit 1 for pfmemalloc page. + * page_is_pfmemalloc() is checked in __page_pool_put_page() to avoid recycling + * the pfmemalloc page. + */ +#define PP_MAGIC_MASK ~0x3UL + +#ifdef CONFIG_PAGE_POOL +static inline bool page_pool_page_is_pp(struct page *page) +{ + return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; +} +#else +static inline bool page_pool_page_is_pp(struct page *page) +{ + return false; +} +#endif + #endif /* _LINUX_MM_H */ -- cgit v1.2.3 From ee62ce7a1d909ccba0399680a03c2dee83bcae95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 9 Apr 2025 12:41:37 +0200 Subject: page_pool: Track DMA-mapped pages and unmap them when destroying the pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When enabling DMA mapping in page_pool, pages are kept DMA mapped until they are released from the pool, to avoid the overhead of re-mapping the pages every time they are used. This causes resource leaks and/or crashes when there are pages still outstanding while the device is torn down, because page_pool will attempt an unmap through a non-existent DMA device on the subsequent page return. To fix this, implement a simple tracking of outstanding DMA-mapped pages in page pool using an xarray. This was first suggested by Mina[0], and turns out to be fairly straight forward: We simply store pointers to pages directly in the xarray with xa_alloc() when they are first DMA mapped, and remove them from the array on unmap. Then, when a page pool is torn down, it can simply walk the xarray and unmap all pages still present there before returning, which also allows us to get rid of the get/put_device() calls in page_pool. Using xa_cmpxchg(), no additional synchronisation is needed, as a page will only ever be unmapped once. To avoid having to walk the entire xarray on unmap to find the page reference, we stash the ID assigned by xa_alloc() into the page structure itself, using the upper bits of the pp_magic field. This requires a couple of defines to avoid conflicting with the POINTER_POISON_DELTA define, but this is all evaluated at compile-time, so does not affect run-time performance. The bitmap calculations in this patch gives the following number of bits for different architectures: - 23 bits on 32-bit architectures - 21 bits on PPC64 (because of the definition of ILLEGAL_POINTER_VALUE) - 32 bits on other 64-bit architectures Stashing a value into the unused bits of pp_magic does have the effect that it can make the value stored there lie outside the unmappable range (as governed by the mmap_min_addr sysctl), for architectures that don't define ILLEGAL_POINTER_VALUE. This means that if one of the pointers that is aliased to the pp_magic field (such as page->lru.next) is dereferenced while the page is owned by page_pool, that could lead to a dereference into userspace, which is a security concern. The risk of this is mitigated by the fact that (a) we always clear pp_magic before releasing a page from page_pool, and (b) this would need a use-after-free bug for struct page, which can have many other risks since page->lru.next is used as a generic list pointer in multiple places in the kernel. As such, with this patch we take the position that this risk is negligible in practice. For more discussion, see[1]. Since all the tracking added in this patch is performed on DMA map/unmap, no additional code is needed in the fast path, meaning the performance overhead of this tracking is negligible there. A micro-benchmark shows that the total overhead of the tracking itself is about 400 ns (39 cycles(tsc) 395.218 ns; sum for both map and unmap[2]). Since this cost is only paid on DMA map and unmap, it seems like an acceptable cost to fix the late unmap issue. Further optimisation can narrow the cases where this cost is paid (for instance by eliding the tracking when DMA map/unmap is a no-op). The extra memory needed to track the pages is neatly encapsulated inside xarray, which uses the 'struct xa_node' structure to track items. This structure is 576 bytes long, with slots for 64 items, meaning that a full node occurs only 9 bytes of overhead per slot it tracks (in practice, it probably won't be this efficient, but in any case it should be an acceptable overhead). [0] https://lore.kernel.org/all/CAHS8izPg7B5DwKfSuzz-iOop_YRbk3Sd6Y4rX7KBG9DcVJcyWg@mail.gmail.com/ [1] https://lore.kernel.org/r/20250320023202.GA25514@openwall.com [2] https://lore.kernel.org/r/ae07144c-9295-4c9d-a400-153bb689fe9e@huawei.com Reported-by: Yonglong Liu Closes: https://lore.kernel.org/r/8743264a-9700-4227-a556-5f931c720211@huawei.com Fixes: ff7d6b27f894 ("page_pool: refurbish version of page_pool code") Suggested-by: Mina Almasry Reviewed-by: Mina Almasry Reviewed-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Qiuling Ren Tested-by: Yuying Ma Tested-by: Yonglong Liu Acked-by: Jesper Dangaard Brouer Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409-page-pool-track-dma-v9-2-6a9ef2e0cba8@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/mm.h | 46 ++++++++++++++++++++++++++++++++++++++++++---- include/linux/poison.h | 4 ++++ 2 files changed, 46 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 56c47f4a38ca..130d3c9d2ee4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4248,13 +4248,51 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #define VM_SEALED_SYSMAP VM_NONE #endif +/* + * DMA mapping IDs for page_pool + * + * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and + * stashes it in the upper bits of page->pp_magic. We always want to be able to + * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP + * pages can have arbitrary kernel pointers stored in the same field as pp_magic + * (since it overlaps with page->lru.next), so we must ensure that we cannot + * mistake a valid kernel pointer with any of the values we write into this + * field. + * + * On architectures that set POISON_POINTER_DELTA, this is already ensured, + * since this value becomes part of PP_SIGNATURE; meaning we can just use the + * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the + * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is + * 0, we make sure that we leave the two topmost bits empty, as that guarantees + * we won't mistake a valid kernel pointer for a value we set, regardless of the + * VMSPLIT setting. + * + * Altogether, this means that the number of bits available is constrained by + * the size of an unsigned long (at the upper end, subtracting two bits per the + * above), and the definition of PP_SIGNATURE (with or without + * POISON_POINTER_DELTA). + */ +#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) +#if POISON_POINTER_DELTA > 0 +/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA + * index to not overlap with that if set + */ +#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) +#else +/* Always leave out the topmost two; see above. */ +#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2) +#endif + +#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ + PP_DMA_INDEX_SHIFT) + /* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for - * the head page of compound page and bit 1 for pfmemalloc page. - * page_is_pfmemalloc() is checked in __page_pool_put_page() to avoid recycling - * the pfmemalloc page. + * the head page of compound page and bit 1 for pfmemalloc page, as well as the + * bits used for the DMA index. page_is_pfmemalloc() is checked in + * __page_pool_put_page() to avoid recycling the pfmemalloc page. */ -#define PP_MAGIC_MASK ~0x3UL +#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) #ifdef CONFIG_PAGE_POOL static inline bool page_pool_page_is_pp(struct page *page) diff --git a/include/linux/poison.h b/include/linux/poison.h index 331a9a996fa8..8ca2235f78d5 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -70,6 +70,10 @@ #define KEY_DESTROY 0xbd /********** net/core/page_pool.c **********/ +/* + * page_pool uses additional free bits within this value to store data, see the + * definition of PP_DMA_INDEX_MASK in mm.h + */ #define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) /********** net/core/skbuff.c **********/ -- cgit v1.2.3 From ceaceaf79ea0fe337344fc5c1fb10a421a362410 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 11 Apr 2025 23:04:19 +0100 Subject: net: ethtool: fix get_ts_stats() documentation Commit 0e9c127729be ("ethtool: add interface to read Tx hardware timestamping statistics") added documentation for timestamping statistics, but added the detailed explanation for this method to the get_ts_info() rather than get_ts_stats(). Move it to the correct entry. Cc: Rahul Rameshbabu Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u3MTz-000Crx-IW@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 8210ece94fa6..013d25858642 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -926,10 +926,11 @@ struct kernel_ethtool_ts_info { * @get_ts_info: Get the time stamping and PTP hardware clock capabilities. * It may be called with RCU, or rtnl or reference on the device. * Drivers supporting transmit time stamps in software should set this to - * ethtool_op_get_ts_info(). Drivers must not zero statistics which they - * don't report. The stats structure is initialized to ETHTOOL_STAT_NOT_SET - * indicating driver does not report statistics. - * @get_ts_stats: Query the device hardware timestamping statistics. + * ethtool_op_get_ts_info(). + * @get_ts_stats: Query the device hardware timestamping statistics. Drivers + * must not zero statistics which they don't report. The stats structure + * is initialized to ETHTOOL_STAT_NOT_SET indicating driver does not + * report statistics. * @get_module_info: Get the size and type of the eeprom contained within * a plug-in module. * @get_module_eeprom: Get the eeprom information from the plug-in module -- cgit v1.2.3 From 651f88cb046c5e002f7c11de2cebf207787d2346 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 12 Apr 2025 09:08:45 +0100 Subject: net: stmmac: remove eee_usecs_rate plat_dat->eee_users_rate is now unused, so remove this member. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u3Vuv-000E7y-9k@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index c4ec8bb8144e..8aed09d65b4a 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -276,7 +276,6 @@ struct plat_stmmacenet_data { int mac_port_sel_speed; int has_xgmac; u8 vlan_fail_q; - unsigned long eee_usecs_rate; struct pci_dev *pdev; int int_snapshot_num; int msi_mac_vec; -- cgit v1.2.3 From d30f83d278a921485b3e877d03fe937bccfcbcdd Mon Sep 17 00:00:00 2001 From: Raviteja Laggyshetty Date: Tue, 15 Apr 2025 09:53:38 +0000 Subject: interconnect: core: Add dynamic id allocation support The current interconnect framework relies on static IDs for node creation and registration, which limits topologies with multiple instances of the same interconnect provider. To address this, introduce icc_node_create_dyn() and icc_link_nodes() APIs to dynamically allocate IDs for interconnect nodes during creation and link. This change removes the dependency on static IDs, allowing multiple instances of the same hardware, such as EPSS L3. Signed-off-by: Raviteja Laggyshetty Link: https://lore.kernel.org/r/20250415095343.32125-3-quic_rlaggysh@quicinc.com Signed-off-by: Georgi Djakov --- include/linux/interconnect-provider.h | 12 ++++++++++++ include/linux/interconnect.h | 3 +++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interconnect-provider.h b/include/linux/interconnect-provider.h index f5aef8784692..55cfebc658e6 100644 --- a/include/linux/interconnect-provider.h +++ b/include/linux/interconnect-provider.h @@ -116,8 +116,10 @@ struct icc_node { int icc_std_aggregate(struct icc_node *node, u32 tag, u32 avg_bw, u32 peak_bw, u32 *agg_avg, u32 *agg_peak); +struct icc_node *icc_node_create_dyn(void); struct icc_node *icc_node_create(int id); void icc_node_destroy(int id); +int icc_link_nodes(struct icc_node *src_node, struct icc_node **dst_node); int icc_link_create(struct icc_node *node, const int dst_id); void icc_node_add(struct icc_node *node, struct icc_provider *provider); void icc_node_del(struct icc_node *node); @@ -136,6 +138,11 @@ static inline int icc_std_aggregate(struct icc_node *node, u32 tag, u32 avg_bw, return -ENOTSUPP; } +static inline struct icc_node *icc_node_create_dyn(void) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline struct icc_node *icc_node_create(int id) { return ERR_PTR(-ENOTSUPP); @@ -145,6 +152,11 @@ static inline void icc_node_destroy(int id) { } +static inline int icc_link_nodes(struct icc_node *src_node, struct icc_node **dst_node) +{ + return -EOPNOTSUPP; +} + static inline int icc_link_create(struct icc_node *node, const int dst_id) { return -ENOTSUPP; diff --git a/include/linux/interconnect.h b/include/linux/interconnect.h index 97ac253df62c..e4b8808823ad 100644 --- a/include/linux/interconnect.h +++ b/include/linux/interconnect.h @@ -20,6 +20,9 @@ #define Mbps_to_icc(x) ((x) * 1000 / 8) #define Gbps_to_icc(x) ((x) * 1000 * 1000 / 8) +/* macro to indicate dynamic id allocation */ +#define ICC_ALLOC_DYN_ID -1 + struct icc_path; struct device; -- cgit v1.2.3 From 0c4f8ed498cea1e2beebf7aa3d198fa381264f88 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 14 Apr 2025 15:22:09 -0700 Subject: mm: skip folio reclaim in legacy memcg contexts for deadlockable mappings Currently in shrink_folio_list(), reclaim for folios under writeback falls into 3 different cases: 1) Reclaim is encountering an excessive number of folios under writeback and this folio has both the writeback and reclaim flags set 2) Dirty throttling is enabled (this happens if reclaim through cgroup is not enabled, if reclaim through cgroupv2 memcg is enabled, or if reclaim is on the root cgroup), or if the folio is not marked for immediate reclaim, or if the caller does not have __GFP_FS (or __GFP_IO if it's going to swap) set 3) Legacy cgroupv1 encounters a folio that already has the reclaim flag set and the caller did not have __GFP_FS (or __GFP_IO if swap) set In cases 1) and 2), we activate the folio and skip reclaiming it while in case 3), we wait for writeback to finish on the folio and then try to reclaim the folio again. In case 3, we wait on writeback because cgroupv1 does not have dirty folio throttling, as such this is a mitigation against the case where there are too many folios in writeback with nothing else to reclaim. If a filesystem (eg fuse) may deadlock due to reclaim waiting on writeback, then the filesystem needs to add inefficient messy workarounds to prevent this. To improve the performance of these filesystems, this commit adds two things: a) a AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM mapping flag that filesystems may set to indicate that reclaim should not wait on writeback b) if legacy memcg encounters a folio with this AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM flag set (eg case 3), the folio will be activated and skip reclaim (eg default to behavior in case 2) instead. Signed-off-by: Joanne Koong Reviewed-by: Shakeel Butt Reviewed-by: Jeff Layton Acked-by: David Hildenbrand Reviewed-by: Jingbo Xu Signed-off-by: Miklos Szeredi --- include/linux/pagemap.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 26baa78f1ca7..e19ff3183bbf 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -210,6 +210,7 @@ enum mapping_flags { AS_STABLE_WRITES = 7, /* must wait for writeback before modifying folio contents */ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ + AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, @@ -335,6 +336,16 @@ static inline bool mapping_inaccessible(struct address_space *mapping) return test_bit(AS_INACCESSIBLE, &mapping->flags); } +static inline void mapping_set_writeback_may_deadlock_on_reclaim(struct address_space *mapping) +{ + set_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); +} + +static inline bool mapping_writeback_may_deadlock_on_reclaim(struct address_space *mapping) +{ + return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return mapping->gfp_mask; -- cgit v1.2.3 From eaa0d30216c1d54b157ea0ad7f35ba76f6e9a825 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Tue, 18 Feb 2025 20:29:46 +0100 Subject: driver core: auxiliary bus: add device creation helpers Add helper functions to create a device on the auxiliary bus. This is meant for fairly simple usage of the auxiliary bus, to avoid having the same code repeated in the different drivers. Suggested-by: Stephen Boyd Cc: Arnd Bergmann Signed-off-by: Jerome Brunet Reviewed-by: Greg Kroah-Hartman Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20250218-aux-device-create-helper-v4-1-c3d7dfdea2e6@baylibre.com Signed-off-by: Greg Kroah-Hartman --- include/linux/auxiliary_bus.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index 65dd7f154374..4086afd0cc6b 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -254,6 +254,23 @@ int __auxiliary_driver_register(struct auxiliary_driver *auxdrv, struct module * void auxiliary_driver_unregister(struct auxiliary_driver *auxdrv); +struct auxiliary_device *auxiliary_device_create(struct device *dev, + const char *modname, + const char *devname, + void *platform_data, + int id); +void auxiliary_device_destroy(void *auxdev); + +struct auxiliary_device *__devm_auxiliary_device_create(struct device *dev, + const char *modname, + const char *devname, + void *platform_data, + int id); + +#define devm_auxiliary_device_create(dev, devname, platform_data) \ + __devm_auxiliary_device_create(dev, KBUILD_MODNAME, devname, \ + platform_data, 0) + /** * module_auxiliary_driver() - Helper macro for registering an auxiliary driver * @__auxiliary_driver: auxiliary driver struct -- cgit v1.2.3 From 981527828c301644bc4014faa9c523e8a5e32a32 Mon Sep 17 00:00:00 2001 From: Vadim Pasternak Date: Sat, 12 Apr 2025 12:18:37 +0300 Subject: platform/mellanox: Rename field to improve code readability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename field 'counter' in 'mlxreg_core_hotplug_platform_data' to count. Signed-off-by: Vadim Pasternak Link: https://lore.kernel.org/r/20250412091843.33943-2-vadimp@nvidia.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/mlxreg.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h index 0b9f81a6f753..f6cca7a035c7 100644 --- a/include/linux/platform_data/mlxreg.h +++ b/include/linux/platform_data/mlxreg.h @@ -209,7 +209,7 @@ struct mlxreg_core_platform_data { * @items: same type components with the hotplug capability; * @irq: platform interrupt number; * @regmap: register map of parent device; - * @counter: number of the components with the hotplug capability; + * @count: number of the components with the hotplug capability; * @cell: location of top aggregation interrupt register; * @mask: top aggregation interrupt common mask; * @cell_low: location of low aggregation interrupt register; @@ -224,7 +224,7 @@ struct mlxreg_core_hotplug_platform_data { struct mlxreg_core_item *items; int irq; void *regmap; - int counter; + int count; u32 cell; u32 mask; u32 cell_low; -- cgit v1.2.3 From f99564688f38458d86b64f099ebf03f19517cf77 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 13 Apr 2025 16:09:40 +0200 Subject: net: phy: remove device_phy_find_device AFAICS this function has never had a user. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/ab7b8094-2eea-4e82-a047-fd60117f220b@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index a2bfae80c449..fb755358d965 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1757,7 +1757,6 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id); struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode); struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode); -struct phy_device *device_phy_find_device(struct device *dev); struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode); struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); @@ -1779,11 +1778,6 @@ struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode) return NULL; } -static inline struct phy_device *device_phy_find_device(struct device *dev) -{ - return NULL; -} - static inline struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode) { -- cgit v1.2.3 From 97d06802d10a2827ef46fd31789a26117ce7f3d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 13 Mar 2025 16:57:45 +0100 Subject: sysfs: constify bin_attribute argument of bin_attribute::read/write() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All callback implementers have been moved to the const variant of the callbacks. The signature of the original callbacks can now be changed. Also remove the now unnecessary transition machinery inside __BIN_ATTR(). Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250313-sysfs-const-bin_attr-final-v2-1-96284e1e88ce@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 18f7e1fd093c..576b8b3c60af 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -306,11 +306,11 @@ struct bin_attribute { size_t size; void *private; struct address_space *(*f_mapping)(void); - ssize_t (*read)(struct file *, struct kobject *, struct bin_attribute *, + ssize_t (*read)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); ssize_t (*read_new)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); - ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *, + ssize_t (*write)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); ssize_t (*write_new)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); @@ -332,28 +332,11 @@ struct bin_attribute { */ #define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr) -typedef ssize_t __sysfs_bin_rw_handler_new(struct file *, struct kobject *, - const struct bin_attribute *, char *, loff_t, size_t); - /* macros to create static binary attributes easier */ #define __BIN_ATTR(_name, _mode, _read, _write, _size) { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ - .read = _Generic(_read, \ - __sysfs_bin_rw_handler_new * : NULL, \ - default : _read \ - ), \ - .read_new = _Generic(_read, \ - __sysfs_bin_rw_handler_new * : _read, \ - default : NULL \ - ), \ - .write = _Generic(_write, \ - __sysfs_bin_rw_handler_new * : NULL, \ - default : _write \ - ), \ - .write_new = _Generic(_write, \ - __sysfs_bin_rw_handler_new * : _write, \ - default : NULL \ - ), \ + .read = _read, \ + .write = _write, \ .size = _size, \ } -- cgit v1.2.3 From 9bec944506faf10d6274c75e3a55bc107b75cea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 13 Mar 2025 16:57:46 +0100 Subject: sysfs: constify attribute_group::bin_attrs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All users of this field have been migrated to bin_attrs_new. It can now be constified. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250313-sysfs-const-bin_attr-final-v2-2-96284e1e88ce@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 576b8b3c60af..f418aae4f113 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -107,7 +107,7 @@ struct attribute_group { int); struct attribute **attrs; union { - struct bin_attribute **bin_attrs; + const struct bin_attribute *const *bin_attrs; const struct bin_attribute *const *bin_attrs_new; }; }; -- cgit v1.2.3 From 2af781a9edc4ef5f6684c0710cc3542d9be48b31 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 10 Apr 2025 17:27:12 +0200 Subject: PCI: pciehp: Ignore Link Down/Up caused by Secondary Bus Reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a Secondary Bus Reset is issued at a hotplug port, it causes a Data Link Layer State Changed event as a side effect. On hotplug ports using in-band presence detect, it additionally causes a Presence Detect Changed event. These spurious events should not result in teardown and re-enumeration of the device in the slot. Hence commit 2e35afaefe64 ("PCI: pciehp: Add reset_slot() method") masked the Presence Detect Changed Enable bit in the Slot Control register during a Secondary Bus Reset. Commit 06a8d89af551 ("PCI: pciehp: Disable link notification across slot reset") additionally masked the Data Link Layer State Changed Enable bit. However masking those bits only disables interrupt generation (PCIe r6.2 sec 6.7.3.1). The events are still visible in the Slot Status register and picked up by the IRQ handler if it runs during a Secondary Bus Reset. This can happen if the interrupt is shared or if an unmasked hotplug event occurs, e.g. Attention Button Pressed or Power Fault Detected. The likelihood of this happening used to be small, so it wasn't much of a problem in practice. That has changed with the recent introduction of bandwidth control in v6.13-rc1 with commit 665745f27487 ("PCI/bwctrl: Re-add BW notification portdrv as PCIe BW controller"): Bandwidth control shares the interrupt with PCIe hotplug. A Secondary Bus Reset causes a Link Bandwidth Notification, so the hotplug IRQ handler runs, picks up the masked events and tears down the device in the slot. As a result, Joel reports VFIO passthrough failure of a GPU, which Ilpo root-caused to the incorrect handling of masked hotplug events. Clearly, a more reliable way is needed to ignore spurious hotplug events. For Downstream Port Containment, a new ignore mechanism was introduced by commit a97396c6eb13 ("PCI: pciehp: Ignore Link Down/Up caused by DPC"). It has been working reliably for the past four years. Adapt it for Secondary Bus Resets. Introduce two helpers to annotate code sections which cause spurious link changes: pci_hp_ignore_link_change() and pci_hp_unignore_link_change() Use those helpers in lieu of masking interrupts in the Slot Control register. Introduce a helper to check whether such a code section is executing concurrently and if so, await it: pci_hp_spurious_link_change() Invoke the helper in the hotplug IRQ thread pciehp_ist(). Re-use the IRQ thread's existing code which ignores DPC-induced link changes unless the link is unexpectedly down after reset recovery or the device was replaced during the bus reset. That code block in pciehp_ist() was previously only executed if a Data Link Layer State Changed event has occurred. Additionally execute it for Presence Detect Changed events. That's necessary for compatibility with PCIe r1.0 hotplug ports because Data Link Layer State Changed didn't exist before PCIe r1.1. DPC was added with PCIe r3.1 and thus DPC-capable hotplug ports always support Data Link Layer State Changed events. But the same cannot be assumed for Secondary Bus Reset, which already existed in PCIe r1.0. Secondary Bus Reset is only one of many causes of spurious link changes. Others include runtime suspend to D3cold, firmware updates or FPGA reconfiguration. The new pci_hp_{,un}ignore_link_change() helpers may be used by all kinds of drivers to annotate such code sections, hence their declarations are publicly visible in . A case in point is the Mellanox Ethernet driver which disables a firmware reset feature if the Ethernet card is attached to a hotplug port, see commit 3d7a3f2612d7 ("net/mlx5: Nack sync reset request when HotPlug is enabled"). Going forward, PCIe hotplug will be able to cope gracefully with all such use cases once the code sections are properly annotated. The new helpers internally use two bits in struct pci_dev's priv_flags as well as a wait_queue. This mirrors what was done for DPC by commit a97396c6eb13 ("PCI: pciehp: Ignore Link Down/Up caused by DPC"). That may be insufficient if spurious link changes are caused by multiple sources simultaneously. An example might be a Secondary Bus Reset issued by AER during FPGA reconfiguration. If this turns out to happen in real life, support for it can easily be added by replacing the PCI_LINK_CHANGING flag with an atomic_t counter incremented by pci_hp_ignore_link_change() and decremented by pci_hp_unignore_link_change(). Instead of awaiting a zero PCI_LINK_CHANGING flag, the pci_hp_spurious_link_change() helper would then simply await a zero counter. Fixes: 665745f27487 ("PCI/bwctrl: Re-add BW notification portdrv as PCIe BW controller") Reported-by: Joel Mathew Thomas Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219765 Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Tested-by: Joel Mathew Thomas Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Ilpo Järvinen Link: https://patch.msgid.link/d04deaf49d634a2edf42bf3c06ed81b4ca54d17b.1744298239.git.lukas@wunner.de --- include/linux/pci.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 0e8e3fd77e96..833b54f3ce6d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1848,6 +1848,14 @@ static inline bool pcie_aspm_support_enabled(void) { return false; } static inline bool pcie_aspm_enabled(struct pci_dev *pdev) { return false; } #endif +#ifdef CONFIG_HOTPLUG_PCI +void pci_hp_ignore_link_change(struct pci_dev *pdev); +void pci_hp_unignore_link_change(struct pci_dev *pdev); +#else +static inline void pci_hp_ignore_link_change(struct pci_dev *pdev) { } +static inline void pci_hp_unignore_link_change(struct pci_dev *pdev) { } +#endif + #ifdef CONFIG_PCIEAER bool pci_aer_available(void); #else -- cgit v1.2.3 From 7c571ac57d9d97190dcba18212fabf99888b0c48 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 14 Apr 2025 14:26:30 -0700 Subject: net: ptp: introduce .supported_extts_flags to ptp_clock_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PTP_EXTTS_REQUEST(2) ioctl has a flags field which specifies how the external timestamp request should behave. This includes which edge of the signal to timestamp, as well as a specialized "offset" mode. It is expected that more flags will be added in the future. Driver authors routinely do not check the flags, often accepting requests with flags which they do not support. Even drivers which do check flags may not be future-proofed to reject flags not yet defined. Thus, any future flag additions often require manually updating drivers to reject these flags. This approach of hoping we catch flag checks during review, or playing whack-a-mole after the fact is the wrong approach. Introduce the "supported_extts_flags" field to the ptp_clock_info structure. This field defines the set of flags the device actually supports. Update the core character device logic to check this field and reject unsupported requests. Getting this right is somewhat tricky. First, to avoid unnecessary repetition and make basic functionality work when .supported_extts_flags is 0, the core always accepts the PTP_ENABLE_FEATURE flag. This flag is used to set the 'on' parameter to the .enable function and is thus always 'supported' by all drivers. For backwards compatibility, the PTP_RISING_EDGE and PTP_FALLING_EDGE flags are merely "hints" when using the old PTP_EXTTS_REQUEST ioctl, and are not expected to be enforced. If the user issues PTP_EXTTS_REQUEST2, the PTP_STRICT_FLAGS flag is added which is supposed to inform the driver to strictly validate the flags and reject unsupported requests. To handle this, first check if the driver reports PTP_STRICT_FLAGS support. If it does not, then always allow the PTP_RISING_EDGE and PTP_FALLING_EDGE flags. This keeps backwards compatibility with the original PTP_EXTTS_REQUEST ioctl where these flags are not guaranteed to be honored. This way, drivers which do not set the supported_extts_flags will continue to accept requests for the original PTP_EXTTS_REQUEST ioctl. The core will automatically reject requests with new flags, and correctly reject requests with PTP_STRICT_FLAGS, where the driver is supposed to strictly validate the flags. Update the various drivers, refactoring their validation logic into the .supported_extts_flags field. For consistency and readability, PTP_ENABLE_FEATURE is not set in the supported flags list, and PTP_EXTTS_EDGES is expanded to PTP_RISING_EDGE | PTP_FALLING_EDGE in all cases. Note the following driver files set n_ext_ts to a non-zero value but did not check flags at all: • drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c • drivers/net/ethernet/freescale/enetc/enetc_ptp.c • drivers/net/ethernet/intel/i40e/i40e_ptp.c • drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c • drivers/net/ethernet/renesas/ravb_ptp.c • drivers/net/ethernet/renesas/rtsn.c • drivers/net/ethernet/renesas/rtsn.h • drivers/net/ethernet/ti/am65-cpts.c • drivers/net/ethernet/ti/cpts.h • drivers/net/ethernet/ti/icssg/icss_iep.c • drivers/net/ethernet/xscale/ptp_ixp46x.c • drivers/net/phy/bcm-phy-ptp.c • drivers/ptp/ptp_ocp.c • drivers/ptp/ptp_pch.c • drivers/ptp/ptp_qoriq.c These drivers behavior does change slightly: they will now reject the PTP_EXTTS_REQUEST2 ioctl, because they do not strictly validate their flags. This also makes them no longer incorrectly accept PTP_EXT_OFFSET. Also note that the renesas ravb driver does not support PTP_STRICT_FLAGS. We could leave the .supported_extts_flags as 0, but I added the PTP_RISING_EDGE | PTP_FALLING_EDGE since the driver previously manually validated these flags. This is equivalent to 0 because the core will allow these flags regardless unless PTP_STRICT_FLAGS is also set. Signed-off-by: Jacob Keller Reviewed-by: Kory Maincent Link: https://patch.msgid.link/20250414-jk-supported-perout-flags-v2-1-f6b17d15475c@intel.com Signed-off-by: Jakub Kicinski --- include/linux/ptp_clock_kernel.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 0d68d09bedd1..25cba2e5ee69 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -68,6 +68,17 @@ struct ptp_system_timestamp { * @n_per_out: The number of programmable periodic signals. * @n_pins: The number of programmable pins. * @pps: Indicates whether the clock supports a PPS callback. + * + * @supported_extts_flags: The set of flags the driver supports for the + * PTP_EXTTS_REQUEST ioctl. The PTP core will use + * this list to reject unsupported requests. + * PTP_ENABLE_FEATURE is assumed and does not need to + * be included. If PTP_STRICT_FLAGS is *not* set, + * then both PTP_RISING_EDGE and PTP_FALLING_EDGE + * will be assumed. Note that PTP_STRICT_FLAGS must + * be set if the drivers wants to honor + * PTP_EXTTS_REQUEST2 and any future flags. + * * @pin_config: Array of length 'n_pins'. If the number of * programmable pins is nonzero, then drivers must * allocate and initialize this array. @@ -174,6 +185,7 @@ struct ptp_clock_info { int n_per_out; int n_pins; int pps; + unsigned int supported_extts_flags; struct ptp_pin_desc *pin_config; int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm); int (*adjphase)(struct ptp_clock_info *ptp, s32 phase); -- cgit v1.2.3 From d9f3e9ecc4562ae07aaf614cf0a6690ef7ca0e10 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 14 Apr 2025 14:26:31 -0700 Subject: net: ptp: introduce .supported_perout_flags to ptp_clock_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PTP_PEROUT_REQUEST2 ioctl has gained support for flags specifying specific output behavior including PTP_PEROUT_ONE_SHOT, PTP_PEROUT_DUTY_CYCLE, PTP_PEROUT_PHASE. Driver authors are notorious for not checking the flags of the request. This results in misinterpreting the request, generating an output signal that does not match the requested value. It is anticipated that even more flags will be added in the future, resulting in even more broken requests. Expecting these issues to be caught during review or playing whack-a-mole after the fact is not a great solution. Instead, introduce the supported_perout_flags field in the ptp_clock_info structure. Update the core character device logic to explicitly reject any request which has a flag not on this list. This ensures that drivers must 'opt in' to the flags they support. Drivers which don't set the .supported_perout_flags field will not need to check that unsupported flags aren't passed, as the core takes care of this. Update the drivers which do support flags to set this new field. Note the following driver files set n_per_out to a non-zero value but did not check the flags at all: • drivers/ptp/ptp_clockmatrix.c • drivers/ptp/ptp_idt82p33.c • drivers/ptp/ptp_fc3.c • drivers/net/ethernet/ti/am65-cpts.c • drivers/net/ethernet/aquantia/atlantic/aq_ptp.c • drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c • drivers/net/dsa/sja1105/sja1105_ptp.c • drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c • drivers/net/ethernet/mscc/ocelot_vsc7514.c • drivers/net/ethernet/intel/i40e/i40e_ptp.c Reviewed-by: Vadim Fedorenko Signed-off-by: Jacob Keller Reviewed-by: Kory Maincent Link: https://patch.msgid.link/20250414-jk-supported-perout-flags-v2-2-f6b17d15475c@intel.com Signed-off-by: Jakub Kicinski --- include/linux/ptp_clock_kernel.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 25cba2e5ee69..eced7e9bf69a 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -69,6 +69,11 @@ struct ptp_system_timestamp { * @n_pins: The number of programmable pins. * @pps: Indicates whether the clock supports a PPS callback. * + * @supported_perout_flags: The set of flags the driver supports for the + * PTP_PEROUT_REQUEST ioctl. The PTP core will + * reject a request with any flag not specified + * here. + * * @supported_extts_flags: The set of flags the driver supports for the * PTP_EXTTS_REQUEST ioctl. The PTP core will use * this list to reject unsupported requests. @@ -185,6 +190,7 @@ struct ptp_clock_info { int n_per_out; int n_pins; int pps; + unsigned int supported_perout_flags; unsigned int supported_extts_flags; struct ptp_pin_desc *pin_config; int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm); -- cgit v1.2.3 From 5bb61dc76d11a661c323dee1505b408d18c31565 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 12 Apr 2025 13:37:00 +0800 Subject: crypto: ahash - Remove request chaining Request chaining requires the user to do too much book keeping. Remove it from ahash. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 1e3809d28abd..dd817f56ff0c 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -14,7 +14,6 @@ #include #include -#include #include #include #include @@ -179,7 +178,6 @@ struct crypto_async_request { struct crypto_tfm *tfm; u32 flags; - int err; }; /** @@ -473,19 +471,6 @@ static inline unsigned int crypto_tfm_ctx_alignment(void) return __alignof__(tfm->__crt_ctx); } -static inline void crypto_reqchain_init(struct crypto_async_request *req) -{ - req->err = -EINPROGRESS; - INIT_LIST_HEAD(&req->list); -} - -static inline void crypto_request_chain(struct crypto_async_request *req, - struct crypto_async_request *head) -{ - req->err = -EINPROGRESS; - list_add_tail(&req->list, &head->list); -} - static inline bool crypto_tfm_is_async(struct crypto_tfm *tfm) { return tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC; -- cgit v1.2.3 From 1451e3e561be9ff4e86b911b9367a2223275d16f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 7 Apr 2025 18:02:51 +0800 Subject: crypto: api - Add helpers to manage request flags Add helpers so that the ON_STACK request flag management is not duplicated all over the place. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index dd817f56ff0c..a387f1547ea0 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -476,5 +476,29 @@ static inline bool crypto_tfm_is_async(struct crypto_tfm *tfm) return tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC; } +static inline bool crypto_req_on_stack(struct crypto_async_request *req) +{ + return req->flags & CRYPTO_TFM_REQ_ON_STACK; +} + +static inline void crypto_request_set_callback( + struct crypto_async_request *req, u32 flags, + crypto_completion_t compl, void *data) +{ + u32 keep = CRYPTO_TFM_REQ_ON_STACK; + + req->complete = compl; + req->data = data; + req->flags &= keep; + req->flags |= flags & ~keep; +} + +static inline void crypto_request_set_tfm(struct crypto_async_request *req, + struct crypto_tfm *tfm) +{ + req->tfm = tfm; + req->flags &= ~CRYPTO_TFM_REQ_ON_STACK; +} + #endif /* _LINUX_CRYPTO_H */ -- cgit v1.2.3 From 6eed1e3552fc076d2617f6793cb148d485696ab6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 7 Apr 2025 18:20:57 +0800 Subject: crypto: api - Mark cra_init/cra_exit as deprecated These functions have been obsoleted by the type-specific init/exit functions. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index a387f1547ea0..56cf229e2530 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -300,17 +300,8 @@ struct cipher_alg { * by @cra_type and @cra_flags above, the associated structure must be * filled with callbacks. This field might be empty. This is the case * for ahash, shash. - * @cra_init: Initialize the cryptographic transformation object. This function - * is used to initialize the cryptographic transformation object. - * This function is called only once at the instantiation time, right - * after the transformation context was allocated. In case the - * cryptographic hardware has some special requirements which need to - * be handled by software, this function shall check for the precise - * requirement of the transformation and put any software fallbacks - * in place. - * @cra_exit: Deinitialize the cryptographic transformation object. This is a - * counterpart to @cra_init, used to remove various changes set in - * @cra_init. + * @cra_init: Deprecated, do not use. + * @cra_exit: Deprecated, do not use. * @cra_u.cipher: Union member which contains a single-block symmetric cipher * definition. See @struct @cipher_alg. * @cra_module: Owner of this transformation implementation. Set to THIS_MODULE -- cgit v1.2.3 From afddce13ce81d52a13898fa0700917835c71acd6 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 7 Apr 2025 18:20:59 +0800 Subject: crypto: api - Add reqsize to crypto_alg Add a reqsize field to crypto_alg with the intention of replacing the type-specific reqsize field currently used by ahash and acomp. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 56cf229e2530..15476b085ce3 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -276,6 +276,7 @@ struct cipher_alg { * to the alignmask of the algorithm being used, in order to * avoid the API having to realign them. Note: the alignmask is * not supported for hash algorithms and is always 0 for them. + * @cra_reqsize: Size of the request context for this algorithm. * @cra_priority: Priority of this transformation implementation. In case * multiple transformations with same @cra_name are available to * the Crypto API, the kernel will use the one with highest @@ -322,6 +323,7 @@ struct crypto_alg { unsigned int cra_blocksize; unsigned int cra_ctxsize; unsigned int cra_alignmask; + unsigned int cra_reqsize; int cra_priority; refcount_t cra_refcnt; @@ -441,6 +443,11 @@ static inline unsigned int crypto_tfm_alg_alignmask(struct crypto_tfm *tfm) return tfm->__crt_alg->cra_alignmask; } +static inline unsigned int crypto_tfm_alg_reqsize(struct crypto_tfm *tfm) +{ + return tfm->__crt_alg->cra_reqsize; +} + static inline u32 crypto_tfm_get_flags(struct crypto_tfm *tfm) { return tfm->crt_flags; -- cgit v1.2.3 From f1440a90465bea1993f937ac7add592ce1e4ff44 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 12 Apr 2025 13:16:43 +0800 Subject: crypto: api - Add support for duplicating algorithms before registration If the bit CRYPTO_ALG_DUP_FIRST is set, an algorithm will be duplicated by kmemdup before registration. This is inteded for hardware-based algorithms that may be unplugged at will. Do not use this if the algorithm data structure is embedded in a bigger data structure. Perform the duplication in the driver instead. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 15476b085ce3..b89b1b348095 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -49,6 +49,15 @@ */ #define CRYPTO_ALG_NEED_FALLBACK 0x00000100 +/* + * Set if the algorithm data structure should be duplicated into + * kmalloc memory before registration. This is useful for hardware + * that can be disconnected at will. Do not use this if the data + * structure is embedded into a bigger one. Duplicate the overall + * data structure in the driver in that case. + */ +#define CRYPTO_ALG_DUP_FIRST 0x00000200 + /* * Set if the algorithm has passed automated run-time testing. Note that * if there is no run-time testing for a given algorithm it is considered -- cgit v1.2.3 From 43eca05b6a3b917c600e10cc6b06bfa57fa57401 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Fri, 11 Apr 2025 10:49:56 +0300 Subject: xfrm: Add explicit dev to .xdo_dev_state_{add,delete,free} Previously, device driver IPSec offload implementations would fall into two categories: 1. Those that used xso.dev to determine the offload device. 2. Those that used xso.real_dev to determine the offload device. The first category didn't work with bonding while the second did. In a non-bonding setup the two pointers are the same. This commit adds explicit pointers for the offload netdevice to .xdo_dev_state_add() / .xdo_dev_state_delete() / .xdo_dev_state_free() which eliminates the confusion and allows drivers from the first category to work with bonding. xso.real_dev now becomes a private pointer managed by the bonding driver. Signed-off-by: Cosmin Ratiu Reviewed-by: Leon Romanovsky Reviewed-by: Nikolay Aleksandrov Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d8544f6a680c..88dfb8aeed3c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1013,9 +1013,13 @@ struct netdev_bpf { #ifdef CONFIG_XFRM_OFFLOAD struct xfrmdev_ops { - int (*xdo_dev_state_add) (struct xfrm_state *x, struct netlink_ext_ack *extack); - void (*xdo_dev_state_delete) (struct xfrm_state *x); - void (*xdo_dev_state_free) (struct xfrm_state *x); + int (*xdo_dev_state_add)(struct net_device *dev, + struct xfrm_state *x, + struct netlink_ext_ack *extack); + void (*xdo_dev_state_delete)(struct net_device *dev, + struct xfrm_state *x); + void (*xdo_dev_state_free)(struct net_device *dev, + struct xfrm_state *x); bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); -- cgit v1.2.3 From adc8d1200dbbe6726abf89070edb58bca95f1485 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 16 Apr 2025 10:01:36 +0300 Subject: i2c: core: Deprecate of_node in struct i2c_board_info Two members of the same or quite similar semantics is quite confusing to begin with. Moreover, fwnode covers all possible firmware descriptions that Linux kernel supports. Deprecate of_node in struct i2c_board_info, so users will be warned and in the future there is a plan to convert the users and remove it completely. Tested-by: Tomi Valkeinen Reviewed-by: Sakari Ailus Acked-by: Sakari Ailus Signed-off-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 2e4903b7f7bc..cc1437f29823 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -405,7 +405,7 @@ static inline bool i2c_detect_slave_mode(struct device *dev) { return false; } * @addr: stored in i2c_client.addr * @dev_name: Overrides the default - dev_name if set * @platform_data: stored in i2c_client.dev.platform_data - * @of_node: pointer to OpenFirmware device node + * @of_node: **DEPRECATED** - use @fwnode for this * @fwnode: device node supplied by the platform firmware * @swnode: software node for the device * @resources: resources associated with the device -- cgit v1.2.3 From 0e3f6c3696424fa90d6f512779d617a05a1cf031 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Wed, 9 Apr 2025 05:34:44 +0000 Subject: sched/topology: Introduce sched_update_asym_prefer_cpu() A subset of AMD Processors supporting Preferred Core Rankings also feature the ability to dynamically switch these rankings at runtime to bias load balancing towards or away from the LLC domain with larger cache. To support dynamically updating "sg->asym_prefer_cpu" without needing to rebuild the sched domain, introduce sched_update_asym_prefer_cpu() which recomutes the "asym_prefer_cpu" when the core-ranking of a CPU changes. sched_update_asym_prefer_cpu() swaps the "sg->asym_prefer_cpu" with the CPU whose ranking has changed if the new ranking is greater than that of the "asym_prefer_cpu". If CPU whose ranking has changed is the current "asym_prefer_cpu", it scans the CPUs of the sched groups to find the new "asym_prefer_cpu" and sets it accordingly. get_group() for non-overlapping sched domains returns the sched group for the first CPU in the sched_group_span() which ensures all CPUs in the group see the updated value of "asym_prefer_cpu". Overlapping groups are allocated differently and will require moving the "asym_prefer_cpu" to "sg->sgc" but since the current implementations do not set "SD_ASYM_PACKING" at NUMA domains, skip additional indirection and place a SCHED_WARN_ON() to alert any future users. Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250409053446.23367-3-kprateek.nayak@amd.com --- include/linux/sched/topology.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7b4301b7235f..198bb5cc1774 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -195,6 +195,8 @@ struct sched_domain_topology_level { }; extern void __init set_sched_topology(struct sched_domain_topology_level *tl); +extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); + # define SD_INIT_NAME(type) .name = #type @@ -223,6 +225,10 @@ static inline bool cpus_share_resources(int this_cpu, int that_cpu) return true; } +static inline void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) +{ +} + #endif /* !CONFIG_SMP */ #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) -- cgit v1.2.3 From dd09eb0e2cc4ba91cd64dba2b733d3f8e1248fd8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 7 Apr 2025 10:29:35 -0700 Subject: EISA: Increase length of device names GCC 15's -Wunterminated-string-initialization warned about truncated name strings. Instead of marking them with the "nonstring" attribute[1], increase their length to correctly include enough space for the terminating NUL character, as they are used with %s format specifiers when showing resource allocations in /proc/ioports: seq_printf(m, "%*s%0*llx-%0*llx : %s\n", ..., r->name); The strings in eisa.ids have a max length of 73, and the 50 limit was an arbitrary limit that was removed back in 2008 with commit ca52a49846f1 ("driver core: remove DEVICE_NAME_SIZE define"). Change the limit to 74 so nothing is truncated any more. Additionally fix the Makefile to use "if_changed" instead of "cmd" to detect changes to the command line used to generate the target, otherwise devlist.h won't be rebuilt. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117178 [1] Signed-off-by: Kees Cook Acked-by: Alejandro Colomar Link: https://lore.kernel.org/r/20250407172926.it.281-kees@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/eisa.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/eisa.h b/include/linux/eisa.h index f98200cae637..21a2ecc1e538 100644 --- a/include/linux/eisa.h +++ b/include/linux/eisa.h @@ -28,6 +28,9 @@ #define EISA_CONFIG_ENABLED 1 #define EISA_CONFIG_FORCED 2 +/* Chosen to hold the longest string in eisa.ids. */ +#define EISA_DEVICE_INFO_NAME_SIZE 74 + /* There is not much we can say about an EISA device, apart from * signature, slot number, and base address. dma_mask is set by * default to parent device mask..*/ @@ -41,7 +44,7 @@ struct eisa_device { u64 dma_mask; struct device dev; /* generic device */ #ifdef CONFIG_EISA_NAMES - char pretty_name[50]; + char pretty_name[EISA_DEVICE_INFO_NAME_SIZE]; #endif }; -- cgit v1.2.3 From e80c235994fd1d7004a4e5d64123f8f07ec80ade Mon Sep 17 00:00:00 2001 From: Alan Borzeszkowski Date: Mon, 14 Apr 2025 19:55:53 +0200 Subject: thunderbolt: Add Thunderbolt/USB4 <-> USB3 match function This function checks whether given USB4 port device matches with USB3.x port device, using ACPI _DSD property. It is designed to be used by component framework to match USB4 ports with Type-C ports they are connected to. Also, added USB4 config stub in case mapping function is not reachable. Signed-off-by: Alan Borzeszkowski Reviewed-by: Heikki Krogerus Signed-off-by: Mika Westerberg --- include/linux/thunderbolt.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 7d902d8c054b..75247486616b 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -11,6 +11,13 @@ #ifndef THUNDERBOLT_H_ #define THUNDERBOLT_H_ +#include + +struct fwnode_handle; +struct device; + +#if IS_REACHABLE(CONFIG_USB4) + #include #include #include @@ -674,4 +681,15 @@ static inline struct device *tb_ring_dma_device(struct tb_ring *ring) return &ring->nhi->pdev->dev; } +bool usb4_usb3_port_match(struct device *usb4_port_dev, + const struct fwnode_handle *usb3_port_fwnode); + +#else /* CONFIG_USB4 */ +static inline bool usb4_usb3_port_match(struct device *usb4_port_dev, + const struct fwnode_handle *usb3_port_fwnode) +{ + return false; +} +#endif /* CONFIG_USB4 */ + #endif /* THUNDERBOLT_H_ */ -- cgit v1.2.3 From 17240749f26e07cafa676688d8a3326086498447 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:29 +0200 Subject: skb: implement skb_send_sock_locked_with_flags() When sending an skb over a socket using skb_send_sock_locked(), it is currently not possible to specify any flag to be set in msghdr->msg_flags. However, we may want to pass flags the user may have specified, like MSG_NOSIGNAL. Extend __skb_send_sock() with a new argument 'flags' and add a new interface named skb_send_sock_locked_with_flags(). Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-12-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f1381aff0f89..beb084ee4f4d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4145,6 +4145,8 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, unsigned int flags); int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len); +int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, + int offset, int len, int flags); int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); -- cgit v1.2.3 From 13f43d7cf3e0570004a0d960bc1be23db827c2ff Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 8 Apr 2025 13:53:56 -0300 Subject: iommu/pages: Formalize the freelist API We want to get rid of struct page references outside the internal allocator implementation. The free list has the driver open code something like: list_add_tail(&virt_to_page(ptr)->lru, freelist); Move the above into a small inline and make the freelist into a wrapper type 'struct iommu_pages_list' so that the compiler can help check all the conversion. This struct has also proven helpful in some future ideas to convert to a singly linked list to get an extra pointer in the struct page, and to signal that the pages should be freed with RCU. Use a temporary _Generic so we don't need to rename the free function as the patches progress. Tested-by: Nicolin Chen Tested-by: Alejandro Jimenez Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v4-c8663abbb606+3f7-iommu_pages_jgg@nvidia.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ccce8a751e2a..6dd3a221c5bc 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -341,6 +341,18 @@ typedef unsigned int ioasid_t; /* Read but do not clear any dirty bits */ #define IOMMU_DIRTY_NO_CLEAR (1 << 0) +/* + * Pages allocated through iommu_alloc_pages_node() can be placed on this list + * using iommu_pages_list_add(). Note: ONLY pages from iommu_alloc_pages_node() + * can be used this way! + */ +struct iommu_pages_list { + struct list_head pages; +}; + +#define IOMMU_PAGES_LIST_INIT(name) \ + ((struct iommu_pages_list){ .pages = LIST_HEAD_INIT(name.pages) }) + #ifdef CONFIG_IOMMU_API /** -- cgit v1.2.3 From 868240c34eb113e4121d4ad8ad7458d8caad87d3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 8 Apr 2025 13:53:59 -0300 Subject: iommu: Change iommu_iotlb_gather to use iommu_page_list This converts the remaining places using list of pages to the new API. The Intel free path was shared with its gather path, so it is converted at the same time. Reviewed-by: Lu Baolu Tested-by: Nicolin Chen Tested-by: Alejandro Jimenez Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/11-v4-c8663abbb606+3f7-iommu_pages_jgg@nvidia.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 6dd3a221c5bc..3fb62165db19 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -375,7 +375,7 @@ struct iommu_iotlb_gather { unsigned long start; unsigned long end; size_t pgsize; - struct list_head freelist; + struct iommu_pages_list freelist; bool queued; }; @@ -864,7 +864,7 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) { *gather = (struct iommu_iotlb_gather) { .start = ULONG_MAX, - .freelist = LIST_HEAD_INIT(gather->freelist), + .freelist = IOMMU_PAGES_LIST_INIT(gather->freelist), }; } -- cgit v1.2.3 From b3efacc451e19a85fb5acb56d95f40532c4e31d2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 8 Apr 2025 13:54:03 -0300 Subject: iommu/pages: Allow sub page sizes to be passed into the allocator Generally drivers have a specific idea what their HW structure size should be. In a lot of cases this is related to PAGE_SIZE, but not always. ARM64, for example, allows a 4K IO page table size on a 64K CPU page table system. Currently we don't have any good support for sub page allocations, but make the API accommodate this by accepting a sub page size from the caller and rounding up internally. This is done by moving away from order as the size input and using size: size == 1 << (order + PAGE_SHIFT) Following patches convert drivers away from using order and try to specify allocation sizes independent of PAGE_SIZE. Reviewed-by: Lu Baolu Tested-by: Alejandro Jimenez Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/15-v4-c8663abbb606+3f7-iommu_pages_jgg@nvidia.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3fb62165db19..062818b51822 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -342,9 +342,9 @@ typedef unsigned int ioasid_t; #define IOMMU_DIRTY_NO_CLEAR (1 << 0) /* - * Pages allocated through iommu_alloc_pages_node() can be placed on this list - * using iommu_pages_list_add(). Note: ONLY pages from iommu_alloc_pages_node() - * can be used this way! + * Pages allocated through iommu_alloc_pages_node_sz() can be placed on this + * list using iommu_pages_list_add(). Note: ONLY pages from + * iommu_alloc_pages_node_sz() can be used this way! */ struct iommu_pages_list { struct list_head pages; -- cgit v1.2.3 From 163ddf1fea590229c30a8dc4c29ff4febfb895c3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 17 Apr 2025 18:24:46 +0300 Subject: spi: Add spi_bpw_to_bytes() helper and use it This helper converts the given bits per word to bytes. The result will always be power-of-two, e.g., =============== ================= Input (in bits) Output (in bytes) =============== ================= 5 1 9 2 21 4 37 8 =============== ================= It will return 0 for the 0 input. There are a couple of cases in SPI that are using the same approach and at least one more (in IIO) would benefit of it. Add a helper for everyone. Signed-off-by: Andy Shevchenko Reviewed-by: David Lechner Link: https://patch.msgid.link/20250417152529.490582-2-andriy.shevchenko@linux.intel.com Acked-by: Mukesh Kumar Savaliya Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 0ba5e49bace4..5b872fe3b1a2 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -1325,6 +1325,32 @@ static inline bool spi_is_bpw_supported(struct spi_device *spi, u32 bpw) return false; } +/** + * spi_bpw_to_bytes - Covert bits per word to bytes + * @bpw: Bits per word + * + * This function converts the given @bpw to bytes. The result is always + * power-of-two, e.g., + * + * =============== ================= + * Input (in bits) Output (in bytes) + * =============== ================= + * 5 1 + * 9 2 + * 21 4 + * 37 8 + * =============== ================= + * + * It will return 0 for the 0 input. + * + * Returns: + * Bytes for the given @bpw. + */ +static inline u32 spi_bpw_to_bytes(u32 bpw) +{ + return roundup_pow_of_two(BITS_TO_BYTES(bpw)); +} + /** * spi_controller_xfer_timeout - Compute a suitable timeout value * @ctlr: SPI device -- cgit v1.2.3 From a1b669ea16c4d7c1a1a8fc7e25aaf651ea0078c3 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 9 Apr 2025 14:45:57 -0700 Subject: bpf: Prepare to reuse get_ctx_arg_idx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename get_ctx_arg_idx to bpf_ctx_arg_idx, and allow others to call it. No functional change. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409214606.2000194-2-ameryhung@gmail.com --- include/linux/btf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index ebc0c0c9b944..b2983706292f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -522,6 +522,7 @@ bool btf_param_match_suffix(const struct btf *btf, const char *suffix); int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, u32 arg_no); +u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, int off); struct bpf_verifier_log; -- cgit v1.2.3 From 151e13ece86d234213b7f224f0e26a957c0eeb3e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 15 Apr 2025 18:02:15 -0700 Subject: net: ethtool: Adjust exactly ETH_GSTRING_LEN-long stats to use memcpy Many drivers populate the stats buffer using C-String based APIs (e.g. ethtool_sprintf() and ethtool_puts()), usually when building up the list of stats individually (i.e. with a for() loop). This, however, requires that the source strings be populated in such a way as to have a terminating NUL byte in the source. Other drivers populate the stats buffer directly using one big memcpy() of an entire array of strings. No NUL termination is needed here, as the bytes are being directly passed through. Yet others will build up the stats buffer individually, but also use memcpy(). This, too, does not need NUL termination of the source strings. However, there are cases where the strings that populate the source stats strings are exactly ETH_GSTRING_LEN long, and GCC 15's -Wunterminated-string-initialization option complains that the trailing NUL byte has been truncated. This situation is fine only if the driver is using the memcpy() approach. If the C-String APIs are used, the destination string name will have its final byte truncated by the required trailing NUL byte applied by the C-string API. For drivers that are already using memcpy() but have initializers that truncate the NUL terminator, mark their source strings as __nonstring to silence the GCC warnings. For drivers that have initializers that truncate the NUL terminator and are using the C-String APIs, switch to memcpy() to avoid destination string truncation and mark their source strings as __nonstring to silence the GCC warnings. (Also introduce ethtool_cpy() as a helper to make this an easy replacement). Specifically the following warnings were investigated and addressed: ../drivers/net/ethernet/chelsio/cxgb/cxgb2.c:364:9: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 364 | "TxFramesAbortedDueToXSCollisions", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/freescale/enetc/enetc_ethtool.c:165:33: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 165 | { ENETC_PM_R1523X(0), "MAC rx 1523 to max-octet packets" }, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/freescale/enetc/enetc_ethtool.c:190:33: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 190 | { ENETC_PM_T1523X(0), "MAC tx 1523 to max-octet packets" }, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/google/gve/gve_ethtool.c:76:9: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 76 | "adminq_dcfg_device_resources_cnt", "adminq_set_driver_parameter_cnt", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c:117:53: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 117 | STMMAC_STAT(ptp_rx_msg_type_pdelay_follow_up), | ^ ../drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c:46:12: note: in definition of macro 'STMMAC_STAT' 46 | { #m, sizeof_field(struct stmmac_extra_stats, m), \ | ^ ../drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c:328:24: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 328 | .str = "a_mac_control_frames_transmitted", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c:340:24: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 340 | .str = "a_pause_mac_ctrl_frames_received", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Kees Cook Reviewed-by: Petr Machata # for mlxsw Reviewed-by: Harshitha Ramamurthy Link: https://patch.msgid.link/20250416010210.work.904-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 013d25858642..7edb5f5e7134 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1330,6 +1330,17 @@ extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...); */ extern void ethtool_puts(u8 **data, const char *str); +/** + * ethtool_cpy - Write possibly-not-NUL-terminated string to ethtool string data + * @data: Pointer to a pointer to the start of string to write into + * @str: NUL-byte padded char array of size ETH_GSTRING_LEN to copy from + */ +#define ethtool_cpy(data, str) do { \ + BUILD_BUG_ON(sizeof(str) != ETH_GSTRING_LEN); \ + memcpy(*(data), str, ETH_GSTRING_LEN); \ + *(data) += ETH_GSTRING_LEN; \ +} while (0) + /* Link mode to forced speed capabilities maps */ struct ethtool_forced_speed_map { u32 speed; -- cgit v1.2.3 From 22cbc1ee268b7ec0000848708944daa61c6e4909 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 15 Apr 2025 20:04:47 -0700 Subject: netdev: fix the locking for netdev notifications Kuniyuki reports that the assert for netdev lock fires when there are netdev event listeners (otherwise we skip the netlink event generation). Correct the locking when coming from the notifier. The NETDEV_XDP_FEAT_CHANGE notifier is already fully locked, it's the documentation that's incorrect. Fixes: 99e44f39a8f7 ("netdev: depend on netdev->lock for xdp features") Reported-by: syzkaller Reported-by: Kuniyuki Iwashima Link: https://lore.kernel.org/20250410171019.62128-1-kuniyu@amazon.com Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250416030447.1077551-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e6036b82ef4c..0321fd952f70 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2520,7 +2520,7 @@ struct net_device { * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: - * @up, @moving_ns, @nd_net, @xdp_flags + * @up, @moving_ns, @nd_net, @xdp_features * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues -- cgit v1.2.3 From 9ff2aa4206eff40a202e425f232036bc84ad4c0e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 17 Mar 2025 23:07:30 -0400 Subject: net: ethtool: mm: extract stmmac verification logic into common library It appears that stmmac is not the only hardware which requires a software-driven verification state machine for the MAC Merge layer. While on the one hand it's good to encourage hardware implementations, on the other hand it's quite difficult to tolerate multiple drivers implementing independently fairly non-trivial logic. Extract the hardware-independent logic from stmmac into library code and put it in ethtool. Name the state structure "mmsv" for MAC Merge Software Verification. Let this expose an operations structure for executing the hardware stuff: sync hardware with the tx_active boolean (result of verification process), enable/disable the pMAC, send mPackets, notify library of external events (reception of mPackets), as well as link state changes. Note that it is assumed that the external events are received in hardirq context. If they are not, it is probably a good idea to disable hardirqs when calling ethtool_mmsv_event_handle(), because the library does not do so. Also, the MM software verification process has no business with the tx_min_frag_size, that is all the driver's to handle. Signed-off-by: Vladimir Oltean Co-developed-by: Choong Yong Liang Signed-off-by: Choong Yong Liang Tested-by: Choong Yong Liang Tested-by: Furong Xu <0x1207@gmail.com> Reviewed-by: Vladimir Oltean Signed-off-by: Faizal Rahim Signed-off-by: Tony Nguyen --- include/linux/ethtool.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 7edb5f5e7134..117718c24814 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -17,9 +17,13 @@ #include #include #include +#include #include #include +#define ETHTOOL_MM_MAX_VERIFY_TIME_MS 128 +#define ETHTOOL_MM_MAX_VERIFY_RETRIES 3 + struct compat_ethtool_rx_flow_spec { u32 flow_type; union ethtool_flow_union h_u; @@ -718,6 +722,75 @@ struct ethtool_mm_stats { u64 MACMergeHoldCount; }; +enum ethtool_mmsv_event { + ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET, + ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET, + ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET, +}; + +/* MAC Merge verification mPacket type */ +enum ethtool_mpacket { + ETHTOOL_MPACKET_VERIFY, + ETHTOOL_MPACKET_RESPONSE, +}; + +struct ethtool_mmsv; + +/** + * struct ethtool_mmsv_ops - Operations for MAC Merge Software Verification + * @configure_tx: Driver callback for the event where the preemptible TX + * becomes active or inactive. Preemptible traffic + * classes must be committed to hardware only while + * preemptible TX is active. + * @configure_pmac: Driver callback for the event where the pMAC state + * changes as result of an administrative setting + * (ethtool) or a call to ethtool_mmsv_link_state_handle(). + * @send_mpacket: Driver-provided method for sending a Verify or a Response + * mPacket. + */ +struct ethtool_mmsv_ops { + void (*configure_tx)(struct ethtool_mmsv *mmsv, bool tx_active); + void (*configure_pmac)(struct ethtool_mmsv *mmsv, bool pmac_enabled); + void (*send_mpacket)(struct ethtool_mmsv *mmsv, enum ethtool_mpacket mpacket); +}; + +/** + * struct ethtool_mmsv - MAC Merge Software Verification + * @ops: operations for MAC Merge Software Verification + * @dev: pointer to net_device structure + * @lock: serialize access to MAC Merge state between + * ethtool requests and link state updates. + * @status: current verification FSM state + * @verify_timer: timer for verification in local TX direction + * @verify_enabled: indicates if verification is enabled + * @verify_retries: number of retries for verification + * @pmac_enabled: indicates if the preemptible MAC is enabled + * @verify_time: time for verification in milliseconds + * @tx_enabled: indicates if transmission is enabled + */ +struct ethtool_mmsv { + const struct ethtool_mmsv_ops *ops; + struct net_device *dev; + spinlock_t lock; + enum ethtool_mm_verify_status status; + struct timer_list verify_timer; + bool verify_enabled; + int verify_retries; + bool pmac_enabled; + u32 verify_time; + bool tx_enabled; +}; + +void ethtool_mmsv_stop(struct ethtool_mmsv *mmsv); +void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up); +void ethtool_mmsv_event_handle(struct ethtool_mmsv *mmsv, + enum ethtool_mmsv_event event); +void ethtool_mmsv_get_mm(struct ethtool_mmsv *mmsv, + struct ethtool_mm_state *state); +void ethtool_mmsv_set_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_cfg *cfg); +void ethtool_mmsv_init(struct ethtool_mmsv *mmsv, struct net_device *dev, + const struct ethtool_mmsv_ops *ops); + /** * struct ethtool_rxfh_param - RXFH (RSS) parameters * @hfunc: Defines the current RSS hash function used by HW (or to be set to). -- cgit v1.2.3 From 818bd489f137e9257d701c0d4bf11efdfd02ca5f Mon Sep 17 00:00:00 2001 From: Romain Gantois Date: Thu, 6 Mar 2025 17:23:25 +0100 Subject: i2c: use client addresses directly in ATR interface The I2C Address Translator (ATR) module defines mappings from i2c_client structs to aliases. However, only the physical address of each i2c_client struct is actually relevant to the workings of the ATR module. Moreover, some drivers require address translation functionality but do not allocate i2c_client structs, accessing the adapter directly instead. The SFP subsystem is an example of this. Replace the "i2c_client" field of the i2c_atr_alias_pair struct with a u16 "addr" field. Rewrite helper functions and callbacks as needed. Reviewed-by: Tomi Valkeinen Tested-by: Tomi Valkeinen Signed-off-by: Romain Gantois Acked-by: Andi Shyti Signed-off-by: Wolfram Sang --- include/linux/i2c-atr.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c-atr.h b/include/linux/i2c-atr.h index 4d5da161c225..14c1f9175c0d 100644 --- a/include/linux/i2c-atr.h +++ b/include/linux/i2c-atr.h @@ -20,20 +20,20 @@ struct i2c_atr; /** * struct i2c_atr_ops - Callbacks from ATR to the device driver. - * @attach_client: Notify the driver of a new device connected on a child - * bus, with the alias assigned to it. The driver must - * configure the hardware to use the alias. - * @detach_client: Notify the driver of a device getting disconnected. The - * driver must configure the hardware to stop using the - * alias. + * @attach_addr: Notify the driver of a new device connected on a child + * bus, with the alias assigned to it. The driver must + * configure the hardware to use the alias. + * @detach_addr: Notify the driver of a device getting disconnected. The + * driver must configure the hardware to stop using the + * alias. * * All these functions return 0 on success, a negative error code otherwise. */ struct i2c_atr_ops { - int (*attach_client)(struct i2c_atr *atr, u32 chan_id, - const struct i2c_client *client, u16 alias); - void (*detach_client)(struct i2c_atr *atr, u32 chan_id, - const struct i2c_client *client); + int (*attach_addr)(struct i2c_atr *atr, u32 chan_id, + u16 addr, u16 alias); + void (*detach_addr)(struct i2c_atr *atr, u32 chan_id, + u16 addr); }; /** -- cgit v1.2.3 From 328a106ce0e8d0596d2b020b6f83efd0c859b084 Mon Sep 17 00:00:00 2001 From: Romain Gantois Date: Thu, 6 Mar 2025 17:23:28 +0100 Subject: i2c: support per-channel ATR alias pools Some I2C address translators (ATRs) assign each of their remote peripheral aliases to a specific channel. To properly handle these devices, add support for having separate alias pools for each ATR channel. This is achieved by allowing callers of i2c_atr_add_adapter to pass an optional alias list. If present, this list will be used to populate the channel's alias pool. Otherwise, the common alias pool will be used. Tested-by: Tomi Valkeinen Signed-off-by: Romain Gantois Acked-by: Andi Shyti Signed-off-by: Wolfram Sang --- include/linux/i2c-atr.h | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c-atr.h b/include/linux/i2c-atr.h index 14c1f9175c0d..1c3a5bcd939f 100644 --- a/include/linux/i2c-atr.h +++ b/include/linux/i2c-atr.h @@ -36,6 +36,29 @@ struct i2c_atr_ops { u16 addr); }; +/** + * struct i2c_atr_adap_desc - An ATR downstream bus descriptor + * @chan_id: Index of the new adapter (0 .. max_adapters-1). This value is + * passed to the callbacks in `struct i2c_atr_ops`. + * @parent: The device used as the parent of the new i2c adapter, or NULL + * to use the i2c-atr device as the parent. + * @bus_handle: The fwnode handle that points to the adapter's i2c + * peripherals, or NULL. + * @num_aliases: The number of aliases in this adapter's private alias pool. Set + * to zero if this adapter uses the ATR's global alias pool. + * @aliases: An optional array of private aliases used by the adapter + * instead of the ATR's global pool of aliases. Must contain + * exactly num_aliases entries if num_aliases > 0, is ignored + * otherwise. + */ +struct i2c_atr_adap_desc { + u32 chan_id; + struct device *parent; + struct fwnode_handle *bus_handle; + size_t num_aliases; + u16 *aliases; +}; + /** * i2c_atr_new() - Allocate and initialize an I2C ATR helper. * @parent: The parent (upstream) adapter @@ -65,12 +88,7 @@ void i2c_atr_delete(struct i2c_atr *atr); /** * i2c_atr_add_adapter - Create a child ("downstream") I2C bus. * @atr: The I2C ATR - * @chan_id: Index of the new adapter (0 .. max_adapters-1). This value is - * passed to the callbacks in `struct i2c_atr_ops`. - * @adapter_parent: The device used as the parent of the new i2c adapter, or NULL - * to use the i2c-atr device as the parent. - * @bus_handle: The fwnode handle that points to the adapter's i2c - * peripherals, or NULL. + * @desc: An ATR adapter descriptor * * After calling this function a new i2c bus will appear. Adding and removing * devices on the downstream bus will result in calls to the @@ -85,9 +103,7 @@ void i2c_atr_delete(struct i2c_atr *atr); * * Return: 0 on success, a negative error code otherwise. */ -int i2c_atr_add_adapter(struct i2c_atr *atr, u32 chan_id, - struct device *adapter_parent, - struct fwnode_handle *bus_handle); +int i2c_atr_add_adapter(struct i2c_atr *atr, struct i2c_atr_adap_desc *desc); /** * i2c_atr_del_adapter - Remove a child ("downstream") I2C bus added by -- cgit v1.2.3 From e8866e26f5e8ec551eec73dca1c30d458f9dca86 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 11 Apr 2025 07:59:09 +0200 Subject: ata: libata-core: Simplify ata_print_version_once Use dev_dbg_once() instead of open-coding the once functionality. Signed-off-by: Heiner Kallweit Signed-off-by: Damien Le Moal --- include/linux/libata.h | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index e5695998acb0..98affa1d9136 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -41,17 +41,6 @@ */ #undef ATA_IRQ_TRAP /* define to ack screaming irqs */ - -#define ata_print_version_once(dev, version) \ -({ \ - static bool __print_once; \ - \ - if (!__print_once) { \ - __print_once = true; \ - ata_print_version(dev, version); \ - } \ -}) - /* defines only for the constants which don't work well as enums */ #define ATA_TAG_POISON 0xfafbfcfdU @@ -1593,7 +1582,11 @@ do { \ #define ata_dev_dbg(dev, fmt, ...) \ ata_dev_printk(debug, dev, fmt, ##__VA_ARGS__) -void ata_print_version(const struct device *dev, const char *version); +static inline void ata_print_version_once(const struct device *dev, + const char *version) +{ + dev_dbg_once(dev, "version %s\n", version); +} /* * ata_eh_info helpers -- cgit v1.2.3 From 296b67059e3026125a1ca942f5506e6ca051749e Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 11 Apr 2025 23:31:40 +0800 Subject: fs/fs_parse: Delete macro fsparam_u32hex() Delete macro fsparam_u32hex() since: - it has no caller. - it uses as type @fs_param_is_u32_hex which is never defined, so will cause compile error when caller uses it. Signed-off-by: Zijun Hu Link: https://lore.kernel.org/20250411-fix_fs-v2-1-5d3395c102e4@quicinc.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs_parser.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 53e566efd5fd..5057faf4f091 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -125,8 +125,6 @@ static inline bool fs_validate_description(const char *name, #define fsparam_u32(NAME, OPT) __fsparam(fs_param_is_u32, NAME, OPT, 0, NULL) #define fsparam_u32oct(NAME, OPT) \ __fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)8) -#define fsparam_u32hex(NAME, OPT) \ - __fsparam(fs_param_is_u32_hex, NAME, OPT, 0, (void *)16) #define fsparam_s32(NAME, OPT) __fsparam(fs_param_is_s32, NAME, OPT, 0, NULL) #define fsparam_u64(NAME, OPT) __fsparam(fs_param_is_u64, NAME, OPT, 0, NULL) #define fsparam_enum(NAME, OPT, array) __fsparam(fs_param_is_enum, NAME, OPT, 0, array) -- cgit v1.2.3 From d1f482108a2cff2b9c6ebebc40b157aaeb8213b3 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 15 Apr 2025 20:25:00 +0800 Subject: fs/fs_parse: Remove unused and problematic validate_constant_table() Remove validate_constant_table() since: - It has no caller. - It has below 3 bugs for good constant table array array[] which must end with a empty entry, and take below invocation for explaination: validate_constant_table(array, ARRAY_SIZE(array), ...) - Always return wrong value due to the last empty entry. - Imprecise error message for missorted case. - Potential NULL pointer dereference since the last pr_err() may use @tbl[i].name NULL pointer to print the last empty entry's name. Signed-off-by: Zijun Hu Link: https://lore.kernel.org/20250415-fix_fs-v4-1-5d575124a3ff@quicinc.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs_parser.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 5057faf4f091..5a0e897cae80 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -87,14 +87,9 @@ extern int lookup_constant(const struct constant_table tbl[], const char *name, extern const struct constant_table bool_names[]; #ifdef CONFIG_VALIDATE_FS_PARSER -extern bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, - int low, int high, int special); extern bool fs_validate_description(const char *name, const struct fs_parameter_spec *desc); #else -static inline bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, - int low, int high, int special) -{ return true; } static inline bool fs_validate_description(const char *name, const struct fs_parameter_spec *desc) { return true; } -- cgit v1.2.3 From 79beea2db0431536d79fc5d321225fb42f955466 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 15 Apr 2025 10:27:50 +0200 Subject: fs: remove uselib() system call This system call has been deprecated for quite a while now. Let's try and remove it from the kernel completely. Link: https://lore.kernel.org/20250415-kanufahren-besten-02ac00e6becd@brauner Acked-by: Kees Cook Signed-off-by: Christian Brauner --- include/linux/binfmts.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 1625c8529e70..65abd5ab8836 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -90,7 +90,6 @@ struct linux_binfmt { struct list_head lh; struct module *module; int (*load_binary)(struct linux_binprm *); - int (*load_shlib)(struct file *); #ifdef CONFIG_COREDUMP int (*core_dump)(struct coredump_params *cprm); unsigned long min_coredump; /* minimal dump size */ -- cgit v1.2.3 From 4ef4ac360101f8bb11b6486ce60cd60ca015be8c Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 17 Apr 2025 00:16:26 +0200 Subject: device_cgroup: avoid access to ->i_rdev in the common case in devcgroup_inode_permission() The routine gets called for every path component during lookup. ->i_mode is going to be cached on account of permission checks, while ->i_rdev is an area which is most likely cache-cold. gcc 14.2 is kind enough to emit one branch: movzwl (%rbx),%eax mov %eax,%edx and $0xb000,%dx cmp $0x2000,%dx je 11bc This patch is lazy in that I don't know if the ->i_rdev branch makes any sense with the newly added mode check upfront. I am not changing any semantics here though. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/20250416221626.2710239-3-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/device_cgroup.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index d02f32b7514e..0864773a57e8 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -18,15 +18,16 @@ static inline int devcgroup_inode_permission(struct inode *inode, int mask) { short type, access = 0; + if (likely(!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))) + return 0; + if (likely(!inode->i_rdev)) return 0; if (S_ISBLK(inode->i_mode)) type = DEVCG_DEV_BLOCK; - else if (S_ISCHR(inode->i_mode)) + else /* S_ISCHR by the test above */ type = DEVCG_DEV_CHAR; - else - return 0; if (mask & MAY_WRITE) access |= DEVCG_ACC_WRITE; -- cgit v1.2.3 From bd32923e5f02fa7b04d487ec265dc8080d27a257 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 31 Mar 2025 17:18:02 +0100 Subject: io_uring: don't store bgid in req->buf_index Pass buffer group id into the rest of helpers via struct buf_sel_arg and remove all reassignments of req->buf_index back to bgid. Now, it only stores buffer indexes, and the group is provided by callers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3ea9fa08113ecb4d9224b943e7806e80a324bdf9.1743437358.git.asml.silence@gmail.com Link: https://lore.kernel.org/io-uring/0c01d76ff12986c2f48614db8610caff8f78c869.1743500909.git.asml.silence@gmail.com/ [axboe: fold in patch from second link] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b44d201520d8..3b467879bca8 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -653,8 +653,7 @@ struct io_kiocb { u8 iopoll_completed; /* * Can be either a fixed buffer index, or used with provided buffers. - * For the latter, before issue it points to the buffer group ID, - * and after selection it points to the buffer ID itself. + * For the latter, it points to the selected buffer ID. */ u16 buf_index; -- cgit v1.2.3 From 632b3186726984319e2337987de86a442407f30e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:19 +0100 Subject: io_uring/zcrx: move zcrx region to struct io_zcrx_ifq Refill queue region is a part of zcrx and should stay in struct io_zcrx_ifq. We can't have multiple queues without it, so move it there. As a result there is no context global zcrx region anymore, and the region is looked up together with its ifq. To protect a concurrent mmap from seeing an inconsistent region we were protecting changes to ->zcrx_region with mmap_lock, but now it protect the publishing of the ifq. Reviewed-by: David Wei Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/24f1a728fc03d0166f16d099575457e10d9d90f2.1745141261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3b467879bca8..06d722289fc5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -448,8 +448,6 @@ struct io_ring_ctx { struct io_mapped_region ring_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; - /* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */ - struct io_mapped_region zcrx_region; }; /* -- cgit v1.2.3 From 19bbfe7b5fcc04d8711e8e1352acc77c1a5c3955 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 21 Apr 2025 10:27:40 +0200 Subject: fs: add S_ANON_INODE This makes it easy to detect proper anonymous inodes and to ensure that we can detect them in codepaths such as readahead(). Readahead on anonymous inodes didn't work because they didn't have a proper mode. Now that they have we need to retain EINVAL being returned otherwise LTP will fail. We also need to ensure that ioctls aren't simply fired like they are for regular files so things like inotify inodes continue to correctly call their own ioctl handlers as in [1]. Reported-by: Xilin Wu Link: https://lore.kernel.org/3A9139D5CD543962+89831381-31b9-4392-87ec-a84a5b3507d8@radxa.com [1] Link: https://lore.kernel.org/7a1a7076-ff6b-4cb0-94e7-7218a0a44028@sirena.org.uk Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 016b0fe1536e..cf7208500cee 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2344,6 +2344,7 @@ struct super_operations { #define S_CASEFOLD (1 << 15) /* Casefolded file */ #define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */ #define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */ +#define S_ANON_INODE (1 << 19) /* Inode is an anonymous inode */ /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -2400,6 +2401,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) +#define IS_ANON_FILE(inode) ((inode)->i_flags & S_ANON_INODE) static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap, struct inode *inode) -- cgit v1.2.3 From 4e2c719782a84702db7fc2dc07ced796f308fec7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Apr 2025 08:32:47 +0200 Subject: x86/cpu: Help users notice when running old Intel microcode Old microcode is bad for users and for kernel developers. For users, it exposes them to known fixed security and/or functional issues. These obviously rarely result in instant dumpster fires in every environment. But it is as important to keep your microcode up to date as it is to keep your kernel up to date. Old microcode also makes kernels harder to debug. A developer looking at an oops need to consider kernel bugs, known CPU issues and unknown CPU issues as possible causes. If they know the microcode is up to date, they can mostly eliminate known CPU issues as the cause. Make it easier to tell if CPU microcode is out of date. Add a list of released microcode. If the loaded microcode is older than the release, tell users in a place that folks can find it: /sys/devices/system/cpu/vulnerabilities/old_microcode Tell kernel kernel developers about it with the existing taint flag: TAINT_CPU_OUT_OF_SPEC == Discussion == When a user reports a potential kernel issue, it is very common to ask them to reproduce the issue on mainline. Running mainline, they will (independently from the distro) acquire a more up-to-date microcode version list. If their microcode is old, they will get a warning about the taint and kernel developers can take that into consideration when debugging. Just like any other entry in "vulnerabilities/", users are free to make their own assessment of their exposure. == Microcode Revision Discussion == The microcode versions in the table were generated from the Intel microcode git repo: 8ac9378a8487 ("microcode-20241112 Release") which as of this writing lags behind the latest microcode-20250211. It can be argued that the versions that the kernel picks to call "old" should be a revision or two old. Which specific version is picked is less important to me than picking *a* version and enforcing it. This repository contains only microcode versions that Intel has deemed to be OS-loadable. It is quite possible that the BIOS has loaded a newer microcode than the latest in this repo. If this happens, the system is considered to have new microcode, not old. Specifically, the sysfs file and taint flag answer the question: Is the CPU running on the latest OS-loadable microcode, or something even later that the BIOS loaded? In other words, Intel never publishes an authoritative list of CPUs and latest microcode revisions. Until it does, this is the best that Linux can do. Also note that the "intel-ucode-defs.h" file is simple, ugly and has lots of magic numbers. That's on purpose and should allow a single file to be shared across lots of stable kernel regardless of if they have the new "VFM" infrastructure or not. It was generated with a dumb script. == FAQ == Q: Does this tell me if my system is secure or insecure? A: No. It only tells you if your microcode was old when the system booted. Q: Should the kernel warn if the microcode list itself is too old? A: No. New kernels will get new microcode lists, both mainline and stable. The only way to have an old list is to be running an old kernel in which case you have bigger problems. Q: Is this for security or functional issues? A: Both. Q: If a given microcode update only has functional problems but no security issues, will it be considered old? A: Yes. All microcode image versions within a microcode release are treated identically. Intel appears to make security updates without disclosing them in the release notes. Thus, all updates are considered to be security-relevant. Q: Who runs old microcode? A: Anybody with an old distro. This happens all the time inside of Intel where there are lots of weird systems in labs that might not be getting regular distro updates and might also be running rather exotic microcode images. Q: If I update my microcode after booting will it stop saying "Vulnerable"? A: No. Just like all the other vulnerabilies, you need to reboot before the kernel will reassess your vulnerability. Signed-off-by: Dave Hansen Signed-off-by: Ingo Molnar Cc: "Ahmed S. Darwish" Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Brian Gerst Cc: John Ogness Cc: Josh Poimboeuf Cc: Juergen Gross Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Link: https://lore.kernel.org/all/20250421195659.CF426C07%40davehans-spike.ostc.intel.com (cherry picked from commit 9127865b15eb0a1bd05ad7efe29489c44394bdc1) --- include/linux/cpu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index e3049543008b..1f5cfc4cc04f 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -78,6 +78,8 @@ extern ssize_t cpu_show_gds(struct device *dev, extern ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_ghostwrite(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_old_microcode(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, -- cgit v1.2.3 From a8dc26a0ec43fd416ca781a8030807e65f71cfc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Thu, 27 Mar 2025 12:54:28 +0000 Subject: firmware: exynos-acpm: introduce devm_acpm_get_by_node() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To allow ACPM clients to simply be children of the ACPM node in DT, they need to be able to get the ACPM handle based on that ACPM node directly. Add an API to allow them to do so, devm_acpm_get_by_node(). At the same time, the previous approach of acquiring the ACPM handle via a DT phandle is now obsolete and we can remove devm_acpm_get_by_phandle(), which was there to facilitate that. There are no existing or anticipated upcoming users of that API, because all clients should be children of the ACPM node going forward. Note that no DTs have been merged that use the old approach, so doing this API change in this driver now will not affect any existing DTs or client drivers. Signed-off-by: André Draszik Link: https://lore.kernel.org/r/20250327-acpm-children-v1-2-0afe15ee2ff7@linaro.org Signed-off-by: Krzysztof Kozlowski --- include/linux/firmware/samsung/exynos-acpm-protocol.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/samsung/exynos-acpm-protocol.h b/include/linux/firmware/samsung/exynos-acpm-protocol.h index 76255b5d06b1..f628bf1862c2 100644 --- a/include/linux/firmware/samsung/exynos-acpm-protocol.h +++ b/include/linux/firmware/samsung/exynos-acpm-protocol.h @@ -11,6 +11,7 @@ #include struct acpm_handle; +struct device_node; struct acpm_pmic_ops { int (*read_reg)(const struct acpm_handle *handle, @@ -44,6 +45,7 @@ struct acpm_handle { struct device; -const struct acpm_handle *devm_acpm_get_by_phandle(struct device *dev, - const char *property); +const struct acpm_handle *devm_acpm_get_by_node(struct device *dev, + struct device_node *np); + #endif /* __EXYNOS_ACPM_PROTOCOL_H */ -- cgit v1.2.3 From f24303631489d250f330373a59b3412103a93b67 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Mon, 24 Mar 2025 09:12:50 +0200 Subject: property: Add functions to iterate named child There are a few use-cases where child nodes with a specific name need to be parsed. Code like: fwnode_for_each_child_node() if (fwnode_name_eq()) ... can be found from a various drivers/subsystems. Adding a macro for this can simplify things a bit. In a few cases the data from the found nodes is later added to an array, which is allocated based on the number of found nodes. One example of such use is the IIO subsystem's ADC channel nodes, where the relevant nodes are named as channel[@N]. Add helpers for iterating and counting device's sub-nodes with certain name instead of open-coding this in every user. Suggested-by: Jonathan Cameron Signed-off-by: Matti Vaittinen Reviewed-by: Andy Shevchenko Reviewed-by: Marcelo Schmitt Link: https://patch.msgid.link/2767173b7b18e974c0bac244688214bd3863ff06.1742560649.git.mazziesaccount@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/property.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index e214ecd241eb..3e83babac0b0 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -167,6 +167,10 @@ struct fwnode_handle *fwnode_get_next_available_child_node( for (child = fwnode_get_next_child_node(fwnode, NULL); child; \ child = fwnode_get_next_child_node(fwnode, child)) +#define fwnode_for_each_named_child_node(fwnode, child, name) \ + fwnode_for_each_child_node(fwnode, child) \ + if (!fwnode_name_eq(child, name)) { } else + #define fwnode_for_each_available_child_node(fwnode, child) \ for (child = fwnode_get_next_available_child_node(fwnode, NULL); child;\ child = fwnode_get_next_available_child_node(fwnode, child)) @@ -178,11 +182,19 @@ struct fwnode_handle *device_get_next_child_node(const struct device *dev, for (child = device_get_next_child_node(dev, NULL); child; \ child = device_get_next_child_node(dev, child)) +#define device_for_each_named_child_node(dev, child, name) \ + device_for_each_child_node(dev, child) \ + if (!fwnode_name_eq(child, name)) { } else + #define device_for_each_child_node_scoped(dev, child) \ for (struct fwnode_handle *child __free(fwnode_handle) = \ device_get_next_child_node(dev, NULL); \ child; child = device_get_next_child_node(dev, child)) +#define device_for_each_named_child_node_scoped(dev, child, name) \ + device_for_each_child_node_scoped(dev, child) \ + if (!fwnode_name_eq(child, name)) { } else + struct fwnode_handle *fwnode_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname); struct fwnode_handle *device_get_named_child_node(const struct device *dev, @@ -210,6 +222,14 @@ int fwnode_irq_get_byname(const struct fwnode_handle *fwnode, const char *name); unsigned int device_get_child_node_count(const struct device *dev); +unsigned int fwnode_get_named_child_node_count(const struct fwnode_handle *fwnode, + const char *name); +static inline unsigned int device_get_named_child_node_count(const struct device *dev, + const char *name) +{ + return fwnode_get_named_child_node_count(dev_fwnode(dev), name); +} + static inline int device_property_read_u8(const struct device *dev, const char *propname, u8 *val) { -- cgit v1.2.3 From f3a8f870fa9c84b34aad4c72c8d0a1213ba13a36 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Mon, 24 Mar 2025 09:13:03 +0200 Subject: iio: adc: add helpers for parsing ADC nodes There are ADC ICs which may have some of the AIN pins usable for other functions. These ICs may have some of the AIN pins wired so that they should not be used for ADC. A common way of marking pins that can be used as ADC inputs is to add corresponding channel@N nodes in the device tree as described in the ADC binding yaml. Add couple of helper functions which can be used to retrieve the channel information from the device node. Signed-off-by: Matti Vaittinen Reviewed-by: Andy Shevchenko Reviewed-by: Marcelo Schmitt Link: https://patch.msgid.link/f1d8b3e15237947738912c0d297b3e1e21d8b03e.1742560649.git.mazziesaccount@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/adc-helpers.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 include/linux/iio/adc-helpers.h (limited to 'include/linux') diff --git a/include/linux/iio/adc-helpers.h b/include/linux/iio/adc-helpers.h new file mode 100644 index 000000000000..56b092a2a4c4 --- /dev/null +++ b/include/linux/iio/adc-helpers.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +/* + * The industrial I/O ADC firmware property parsing helpers + * + * Copyright (c) 2025 Matti Vaittinen + */ + +#ifndef _INDUSTRIAL_IO_ADC_HELPERS_H_ +#define _INDUSTRIAL_IO_ADC_HELPERS_H_ + +#include + +struct device; +struct iio_chan_spec; + +static inline int iio_adc_device_num_channels(struct device *dev) +{ + return device_get_named_child_node_count(dev, "channel"); +} + +int devm_iio_adc_device_alloc_chaninfo_se(struct device *dev, + const struct iio_chan_spec *template, + int max_chan_id, + struct iio_chan_spec **cs); + +#endif /* _INDUSTRIAL_IO_ADC_HELPERS_H_ */ -- cgit v1.2.3 From cfed1969fcfe69d0b0db780efe2b6fedeaf7f2b3 Mon Sep 17 00:00:00 2001 From: Olivier Moysan Date: Fri, 14 Mar 2025 18:14:46 +0100 Subject: iio: trigger: stm32-lptimer: add support for stm32mp25 Add support for STM32MP25 SoC. Use newly introduced compatible to handle this new HW variant. Add new trigger definitions that can be used by the stm32 analog-to-digital converter. Use compatible data to identify them. Signed-off-by: Olivier Moysan Signed-off-by: Fabrice Gasnier Link: https://patch.msgid.link/20250314171451.3497789-4-fabrice.gasnier@foss.st.com Signed-off-by: Jonathan Cameron --- include/linux/iio/timer/stm32-lptim-trigger.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/timer/stm32-lptim-trigger.h b/include/linux/iio/timer/stm32-lptim-trigger.h index a34dcf6a6001..ce3cf0addb2e 100644 --- a/include/linux/iio/timer/stm32-lptim-trigger.h +++ b/include/linux/iio/timer/stm32-lptim-trigger.h @@ -14,6 +14,15 @@ #define LPTIM1_OUT "lptim1_out" #define LPTIM2_OUT "lptim2_out" #define LPTIM3_OUT "lptim3_out" +#define LPTIM4_OUT "lptim4_out" +#define LPTIM5_OUT "lptim5_out" + +#define LPTIM1_CH1 "lptim1_ch1" +#define LPTIM1_CH2 "lptim1_ch2" +#define LPTIM2_CH1 "lptim2_ch1" +#define LPTIM2_CH2 "lptim2_ch2" +#define LPTIM3_CH1 "lptim3_ch1" +#define LPTIM4_CH1 "lptim4_ch1" #if IS_REACHABLE(CONFIG_IIO_STM32_LPTIMER_TRIGGER) bool is_stm32_lptim_trigger(struct iio_trigger *trig); -- cgit v1.2.3 From 5d1dff5b45b7e81206ada46ae996f58129a68a7c Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Mon, 31 Mar 2025 13:13:17 +0100 Subject: iio: Adjust internals of handling of direct mode claiming to suit new API. Now there are no remaining callers of iio_device_claim_direct_mode() and iio_device_release_direct_mode() rename those functions to ensure they are not used in new drivers. Also make them now return booleans in line with the sparse friendly static inline wrappers. Reviewed-by: David Lechner Reviewed-by: Marcelo Schmitt Link: https://patch.msgid.link/20250331121317.1694135-38-jic23@kernel.org Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 07a0e8132e88..638cf2420fbd 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -659,8 +659,8 @@ void iio_device_unregister(struct iio_dev *indio_dev); int __devm_iio_device_register(struct device *dev, struct iio_dev *indio_dev, struct module *this_mod); int iio_push_event(struct iio_dev *indio_dev, u64 ev_code, s64 timestamp); -int iio_device_claim_direct_mode(struct iio_dev *indio_dev); -void iio_device_release_direct_mode(struct iio_dev *indio_dev); +bool __iio_device_claim_direct(struct iio_dev *indio_dev); +void __iio_device_release_direct(struct iio_dev *indio_dev); /* * Helper functions that allow claim and release of direct mode @@ -671,9 +671,7 @@ void iio_device_release_direct_mode(struct iio_dev *indio_dev); */ static inline bool iio_device_claim_direct(struct iio_dev *indio_dev) { - int ret = iio_device_claim_direct_mode(indio_dev); - - if (ret) + if (!__iio_device_claim_direct(indio_dev)) return false; __acquire(iio_dev); @@ -683,7 +681,7 @@ static inline bool iio_device_claim_direct(struct iio_dev *indio_dev) static inline void iio_device_release_direct(struct iio_dev *indio_dev) { - iio_device_release_direct_mode(indio_dev); + __iio_device_release_direct(indio_dev); __release(indio_dev); } -- cgit v1.2.3 From 2086321576425361e8ea0052d9ef6eec55f99a6f Mon Sep 17 00:00:00 2001 From: Angelo Dureghello Date: Wed, 9 Apr 2025 20:36:30 +0200 Subject: iio: backend: add support for data source get MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add backend support for getting the data source used. The ad3552r HDL implements an internal ramp generator, so adding the getter to allow data source get/set by debugfs. Reviewed-by: Nuno Sá Signed-off-by: Angelo Dureghello Link: https://patch.msgid.link/20250409-wip-bl-ad3552r-fixes-v5-3-fb429c3a6515@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index e45b7dfbec35..e59d909cb659 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -84,6 +84,7 @@ enum iio_backend_interface_type { * @chan_disable: Disable one channel. * @data_format_set: Configure the data format for a specific channel. * @data_source_set: Configure the data source for a specific channel. + * @data_source_get: Data source getter for a specific channel. * @set_sample_rate: Configure the sampling rate for a specific channel. * @test_pattern_set: Configure a test pattern. * @chan_status: Get the channel status. @@ -115,6 +116,8 @@ struct iio_backend_ops { const struct iio_backend_data_fmt *data); int (*data_source_set)(struct iio_backend *back, unsigned int chan, enum iio_backend_data_source data); + int (*data_source_get)(struct iio_backend *back, unsigned int chan, + enum iio_backend_data_source *data); int (*set_sample_rate)(struct iio_backend *back, unsigned int chan, u64 sample_rate_hz); int (*test_pattern_set)(struct iio_backend *back, @@ -176,6 +179,8 @@ int iio_backend_data_format_set(struct iio_backend *back, unsigned int chan, const struct iio_backend_data_fmt *data); int iio_backend_data_source_set(struct iio_backend *back, unsigned int chan, enum iio_backend_data_source data); +int iio_backend_data_source_get(struct iio_backend *back, unsigned int chan, + enum iio_backend_data_source *data); int iio_backend_set_sampling_freq(struct iio_backend *back, unsigned int chan, u64 sample_rate_hz); int iio_backend_test_pattern_set(struct iio_backend *back, -- cgit v1.2.3 From 74e5b13a1b0f10c5a5c6168f6915620a1d369fae Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 21 Apr 2025 15:52:50 -0700 Subject: lsm: Move security_netlink_send to under CONFIG_SECURITY_NETWORK security_netlink_send() is a networking hook, so it fits better under CONFIG_SECURITY_NETWORK. Signed-off-by: Song Liu Signed-off-by: Paul Moore --- include/linux/security.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index cc9b54d95d22..dba349629229 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -563,7 +563,6 @@ int security_setselfattr(unsigned int attr, struct lsm_ctx __user *ctx, int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value); int security_setprocattr(int lsmid, const char *name, void *value, size_t size); -int security_netlink_send(struct sock *sk, struct sk_buff *skb); int security_ismaclabel(const char *name); int security_secid_to_secctx(u32 secid, struct lsm_context *cp); int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp); @@ -1527,11 +1526,6 @@ static inline int security_setprocattr(int lsmid, char *name, void *value, return -EINVAL; } -static inline int security_netlink_send(struct sock *sk, struct sk_buff *skb) -{ - return 0; -} - static inline int security_ismaclabel(const char *name) { return 0; @@ -1629,6 +1623,7 @@ static inline int security_watch_key(struct key *key) #ifdef CONFIG_SECURITY_NETWORK +int security_netlink_send(struct sock *sk, struct sk_buff *skb); int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk); int security_unix_may_send(struct socket *sock, struct socket *other); int security_socket_create(int family, int type, int protocol, int kern); @@ -1684,6 +1679,11 @@ int security_sctp_assoc_established(struct sctp_association *asoc, int security_mptcp_add_subflow(struct sock *sk, struct sock *ssk); #else /* CONFIG_SECURITY_NETWORK */ +static inline int security_netlink_send(struct sock *sk, struct sk_buff *skb) +{ + return 0; +} + static inline int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) -- cgit v1.2.3 From 145436ae01193c0a379fd3ea9c4fbdf32863db1f Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Wed, 16 Apr 2025 19:14:49 +0200 Subject: net: phy: Add helper for getting MAC termination resistance Add helper which returns the MAC termination resistance value. Modifying the resistance to an appropriate value can reduce signal reflections and therefore improve signal quality. Reviewed-by: Russell King (Oracle) Signed-off-by: Dimitri Fedrau Link: https://patch.msgid.link/20250416-dp83822-mac-impedance-v3-3-028ac426cddb@liebherr.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index fb755358d965..066a28a4b64b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2040,6 +2040,9 @@ int phy_get_tx_amplitude_gain(struct phy_device *phydev, struct device *dev, enum ethtool_link_mode_bit_indices linkmode, u32 *val); +int phy_get_mac_termination(struct phy_device *phydev, struct device *dev, + u32 *val); + void phy_resolve_pause(unsigned long *local_adv, unsigned long *partner_adv, bool *tx_pause, bool *rx_pause); -- cgit v1.2.3 From 7650f826f7b2d84782f9147c51687ff0364125e9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 18 Apr 2025 10:58:41 +0800 Subject: crypto: shash - Handle partial blocks in API Provide an option to handle the partial blocks in the shash API. Almost every hash algorithm has a block size and are only able to hash partial blocks on finalisation. Rather than duplicating the partial block handling many times, add this functionality to the shash API. It is optional (e.g., hmac would never need this by relying on the partial block handling of the underlying hash), and to enable it set the bit CRYPTO_AHASH_ALG_BLOCK_ONLY. The export format is always that of the underlying hash export, plus the partial block buffer, followed by a single-byte for the partial block length. Set the bit CRYPTO_AHASH_ALG_FINAL_NONZERO to withhold an extra byte in the partial block. This will come in handy when this is extended to ahash where hardware often can't deal with a zero-length final. It will also be used for algorithms requiring an extra block for finalisation (e.g., cmac). As an optimisation, set the bit CRYPTO_AHASH_ALG_FINUP_MAX if the algorithm wishes to get as much data as possible instead of just the last partial block. The descriptor will be zeroed after finalisation. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index b89b1b348095..f691ce01745e 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -136,6 +136,8 @@ /* Set if the algorithm supports request chains and virtual addresses. */ #define CRYPTO_ALG_REQ_CHAIN 0x00040000 +/* The high bits 0xff000000 are reserved for type-specific flags. */ + /* * Transform masks and values (for crt_flags). */ -- cgit v1.2.3 From afc0a570bb6138713d74784a032e4eb5b7f919ca Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 1 Apr 2025 10:17:03 +0100 Subject: PCI: host-generic: Extract an ECAM bridge creation helper from pci_host_common_probe() pci_host_common_probe() is an extremely useful helper, as it abstracts away most of the gunk that a "mostly-ECAM-compliant" device driver needs. However, it is structured as a probe function, meaning that a lot of the driver-specific setup has to happen in a .init() callback, after the bridge and config space have been instantiated. This is a bit awkward, and results in a number of convolutions that could be avoided if the host-common code was more like a library. Introduce a pci_host_common_init() helper that does exactly that, taking the platform device and a struct pci_ecam_op as parameters. This can then be called from the probe routine, and a lot of the code that isn't relevant to PCI setup moved away from the .init() callback. This also removes the dependency on the device match data, which is an oddity. Signed-off-by: Marc Zyngier [mani: fixed spelling mistakes] Signed-off-by: Manivannan Sadhasivam Tested-by: Janne Grunau Reviewed-by: Rob Herring (Arm) Reviewed-by: Manivannan Sadhasivam Acked-by: Alyssa Rosenzweig Link: https://patch.msgid.link/20250401091713.2765724-4-maz@kernel.org --- include/linux/pci-ecam.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index 3a10f8cfc3ad..bc2ca2c72ee2 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -97,6 +97,8 @@ extern const struct pci_ecam_ops loongson_pci_ecam_ops; /* Loongson PCIe */ #if IS_ENABLED(CONFIG_PCI_HOST_COMMON) /* for DT-based PCI controllers that support ECAM */ int pci_host_common_probe(struct platform_device *pdev); +int pci_host_common_init(struct platform_device *pdev, + const struct pci_ecam_ops *ops); void pci_host_common_remove(struct platform_device *pdev); #endif #endif -- cgit v1.2.3 From 9861f21ff16b6cd919144dae7ff355d5145a3474 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Fri, 14 Mar 2025 11:00:55 +0100 Subject: pmdomain: core: Add genpd helper to correct the usage/rejected counters In the cpuidle-psci-domain case the ->power_off() callback is usually returning zero to indicate success. This is because the actual call to the PSCI FW to enter the selected domain-idlestate, needs to be done after the ->power_off() callback has returned. When the call to the PSCI FW fails, this leads to receiving an incorrect tracking of the usage/rejected counts for the selected domain-idlestate. In other words, the presented debug-statistics for genpd may look better than what the actually are. To allow a better correctness of the data, let's add a new genpd helper function, which enables the caller adjust the usage/rejected counters for a domain-idlestate, in cases of errors during power-off. Signed-off-by: Ulf Hansson Link: https://lore.kernel.org/r/20250314100103.1294715-2-ulf.hansson@linaro.org --- include/linux/pm_domain.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index d56a78af4af1..6e808aeecbcb 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -285,6 +285,8 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd, int pm_genpd_init(struct generic_pm_domain *genpd, struct dev_power_governor *gov, bool is_off); int pm_genpd_remove(struct generic_pm_domain *genpd); +void pm_genpd_inc_rejected(struct generic_pm_domain *genpd, + unsigned int state_idx); struct device *dev_to_genpd_dev(struct device *dev); int dev_pm_genpd_set_performance_state(struct device *dev, unsigned int state); int dev_pm_genpd_add_notifier(struct device *dev, struct notifier_block *nb); @@ -336,6 +338,10 @@ static inline int pm_genpd_remove(struct generic_pm_domain *genpd) return -EOPNOTSUPP; } +static inline void pm_genpd_inc_rejected(struct generic_pm_domain *genpd, + unsigned int state_idx) +{ } + static inline struct device *dev_to_genpd_dev(struct device *dev) { return ERR_PTR(-EOPNOTSUPP); -- cgit v1.2.3 From 0a8a888167ddaaec7a292e5045782b8a240e6f3e Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Fri, 14 Mar 2025 11:00:58 +0100 Subject: pmdomain: core: Add residency reflection for domain-idlestates to debugfs For regular cpuidle states we are reflecting over the selected/entered state to see if the sleep-duration meets the residency for the state. The output from the reflection is an "above" value to indicate the number of times the state was too deep and a "below" value for the number of times it was too shallow. Let's implement the similar thing for genpd's domain-idlestates along with genpd's governor and put the information in the genpd's debugfs. Signed-off-by: Ulf Hansson Link: https://lore.kernel.org/r/20250314100103.1294715-5-ulf.hansson@linaro.org --- include/linux/pm_domain.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 6e808aeecbcb..0b18160901a2 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -142,6 +142,8 @@ struct genpd_governor_data { bool max_off_time_changed; ktime_t next_wakeup; ktime_t next_hrtimer; + ktime_t last_enter; + bool reflect_residency; bool cached_power_down_ok; bool cached_power_down_state_idx; }; @@ -153,6 +155,8 @@ struct genpd_power_state { s64 residency_ns; u64 usage; u64 rejected; + u64 above; + u64 below; struct fwnode_handle *fwnode; u64 idle_time; void *data; -- cgit v1.2.3 From 76a853f86c97b348dc96e75a6e6f94d8750715ee Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 13 Mar 2025 13:49:39 +0100 Subject: wifi: free SKBTX_WIFI_STATUS skb tx_flags flag Jason mentioned at netdevconf that we've run out of tx_flags in the skb_shinfo(). Gain one bit back by removing the wifi bit. We can do that because the only userspace application for it (hostapd) doesn't change the setting on the socket, it just uses different sockets, and normally doesn't even use this any more, sending the frames over nl80211 instead. Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250313134942.52ff54a140ec.If390bbdc46904cf451256ba989d7a056c457af6e@changeid Signed-off-by: Johannes Berg --- include/linux/skbuff.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b974a277975a..9ee39670e8f4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -481,9 +481,6 @@ enum { /* generate software time stamp on packet tx completion */ SKBTX_COMPLETION_TSTAMP = 1 << 3, - /* generate wifi status information (where possible) */ - SKBTX_WIFI_STATUS = 1 << 4, - /* determine hardware time stamp based on time or cycles */ SKBTX_HW_TSTAMP_NETDEV = 1 << 5, -- cgit v1.2.3 From 76f1cc98b23cefd1f0ae90c51f1fb837e5f46528 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Apr 2025 10:31:20 +0100 Subject: io_uring/zcrx: add support for multiple ifqs Allow the user to register multiple ifqs / zcrx contexts. With that we can use multiple interfaces / interface queues in a single io_uring instance. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/668b03bee03b5216564482edcfefbc2ee337dd30.1745141261.git.asml.silence@gmail.com [axboe: fold in fix] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 06d722289fc5..7e23e993280e 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -40,8 +40,6 @@ enum io_uring_cmd_flags { IO_URING_F_TASK_DEAD = (1 << 13), }; -struct io_zcrx_ifq; - struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -394,7 +392,8 @@ struct io_ring_ctx { struct wait_queue_head poll_wq; struct io_restriction restrictions; - struct io_zcrx_ifq *ifq; + /* Stores zcrx object pointers of type struct io_zcrx_ifq */ + struct xarray zcrx_ctxs; u32 pers_next; struct xarray personalities; -- cgit v1.2.3 From fcc2d3e11bcc8f01d52a8c419f49f86ff8343b7c Mon Sep 17 00:00:00 2001 From: Karthikeyan Kathirvel Date: Mon, 21 Apr 2025 16:45:05 +0530 Subject: wifi: ieee80211: define beacon protection bit field An AP supporting Beacon Protection should set bit 84 in the extended capabilities IE (9.4.2.25 in the 802.11be D7 spec). So the *4th* bit of the 10th byte should be checked to figure out whether beacon protection is enabled or disabled. Signed-off-by: Karthikeyan Kathirvel Reviewed-by: Jeff Johnson Link: https://patch.msgid.link/20250421111505.3633992-1-karthikeyan.kathirvel@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 508d466de1cc..cbc3928aa504 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4087,6 +4087,9 @@ enum ieee80211_tdls_actioncode { /* Defines support for enhanced multi-bssid advertisement*/ #define WLAN_EXT_CAPA11_EMA_SUPPORT BIT(3) +/* Enable Beacon Protection */ +#define WLAN_EXT_CAPA11_BCN_PROTECT BIT(4) + /* TDLS specific payload type in the LLC/SNAP header */ #define WLAN_TDLS_SNAP_RFTYPE 0x2 -- cgit v1.2.3 From 91ea0489dc97bfda72ed74f98ab66dc0ab4235c7 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Thu, 27 Mar 2025 10:43:19 +0530 Subject: wifi: ieee80211: Add helpers to fetch EMLSR delay and timeout values Add helpers to get EMLSR transition delay, padding delay and transition timeout values from EML capabilities field of Multi-link Element. Signed-off-by: Rameshkumar Sundaram Link: https://patch.msgid.link/20250327051320.3253783-4-quic_ramess@quicinc.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 74 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index cbc3928aa504..15a87f522017 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -5618,6 +5618,80 @@ static inline bool ieee80211_tid_to_link_map_size_ok(const u8 *data, size_t len) return len >= fixed + elem_len; } +/** + * ieee80211_emlsr_pad_delay_in_us - Fetch the EMLSR Padding delay + * in microseconds + * @eml_cap: EML capabilities field value from common info field of + * the Multi-link element + * Return: the EMLSR Padding delay (in microseconds) encoded in the + * EML Capabilities field + */ + +static inline u32 ieee80211_emlsr_pad_delay_in_us(u16 eml_cap) +{ + /* IEEE Std 802.11be-2024 Table 9-417i—Encoding of the EMLSR + * Padding Delay subfield. + */ + u32 pad_delay = u16_get_bits(eml_cap, + IEEE80211_EML_CAP_EMLSR_PADDING_DELAY); + + if (!pad_delay || + pad_delay > IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US) + return 0; + + return 32 * (1 << (pad_delay - 1)); +} + +/** + * ieee80211_emlsr_trans_delay_in_us - Fetch the EMLSR Transition + * delay in microseconds + * @eml_cap: EML capabilities field value from common info field of + * the Multi-link element + * Return: the EMLSR Transition delay (in microseconds) encoded in the + * EML Capabilities field + */ + +static inline u32 ieee80211_emlsr_trans_delay_in_us(u16 eml_cap) +{ + /* IEEE Std 802.11be-2024 Table 9-417j—Encoding of the EMLSR + * Transition Delay subfield. + */ + u32 trans_delay = + u16_get_bits(eml_cap, + IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY); + + /* invalid values also just use 0 */ + if (!trans_delay || + trans_delay > IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US) + return 0; + + return 16 * (1 << (trans_delay - 1)); +} + +/** + * ieee80211_eml_trans_timeout_in_us - Fetch the EMLSR Transition + * timeout value in microseconds + * @eml_cap: EML capabilities field value from common info field of + * the Multi-link element + * Return: the EMLSR Transition timeout (in microseconds) encoded in + * the EML Capabilities field + */ + +static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) +{ + /* IEEE Std 802.11be-2024 Table 9-417m—Encoding of the + * Transition Timeout subfield. + */ + u8 timeout = u16_get_bits(eml_cap, + IEEE80211_EML_CAP_TRANSITION_TIMEOUT); + + /* invalid values also just use 0 */ + if (!timeout || timeout > IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU) + return 0; + + return 128 * (1 << (timeout - 1)); +} + #define for_each_mle_subelement(_elem, _data, _len) \ if (ieee80211_mle_size_ok(_data, _len)) \ for_each_element(_elem, \ -- cgit v1.2.3 From bfa4477751e9909bb121eca860f44d1b4259d871 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 22 Apr 2025 17:05:31 -0600 Subject: PM: runtime: Define pm_runtime_put cleanup helper Define a cleanup helper for use with __free to automatically drop the device usage count when the pointer goes out of scope. Signed-off-by: Alex Williamson Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki Link: https://patch.msgid.link/20250422230534.2295291-2-alex.williamson@redhat.com --- include/linux/pm_runtime.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 7fb5a459847e..69d4b2929ee6 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -466,6 +466,8 @@ static inline int pm_runtime_put(struct device *dev) return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); } +DEFINE_FREE(pm_runtime_put, struct device *, if (_T) pm_runtime_put(_T)) + /** * __pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. * @dev: Target device. -- cgit v1.2.3 From 52358dd63e348c3b6c488acc105be1aeda8fb923 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 18 Apr 2025 11:04:01 +0200 Subject: net: phy: remove function stubs All callers of these functions depend on PHYLIB or select it directly or indirectly by selecting PHYLINK. Stubs make sense for optional functionality, but that's not the case here. MDIO_XGENE usually is selected by NET_XGENE which also selects PHYLIB. Add a dependency to PHYLIB nevertheless, in order not to break randconfig builds. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/f7a69a1f-60e9-4ac0-8b7c-481e0cc850e7@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 37 ------------------------------------- 1 file changed, 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 066a28a4b64b..3beaf225ee88 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1753,7 +1753,6 @@ int phy_modify_paged(struct phy_device *phydev, int page, u32 regnum, struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, bool is_c45, struct phy_c45_device_ids *c45_ids); -#if IS_ENABLED(CONFIG_PHYLIB) int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id); struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode); struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode); @@ -1761,42 +1760,6 @@ struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode); struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); void phy_device_free(struct phy_device *phydev); -#else -static inline int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id) -{ - return 0; -} -static inline -struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode) -{ - return 0; -} - -static inline -struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode) -{ - return NULL; -} - -static inline -struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode) -{ - return NULL; -} - -static inline -struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45) -{ - return NULL; -} - -static inline int phy_device_register(struct phy_device *phy) -{ - return 0; -} - -static inline void phy_device_free(struct phy_device *phydev) { } -#endif /* CONFIG_PHYLIB */ void phy_device_remove(struct phy_device *phydev); int phy_get_c45_ids(struct phy_device *phydev); int phy_init_hw(struct phy_device *phydev); -- cgit v1.2.3 From fe7f7ac8e0c708446ff017453add769ffc15deed Mon Sep 17 00:00:00 2001 From: Terry Junge Date: Wed, 12 Mar 2025 15:23:31 -0700 Subject: HID: usbhid: Eliminate recurrent out-of-bounds bug in usbhid_parse() Update struct hid_descriptor to better reflect the mandatory and optional parts of the HID Descriptor as per USB HID 1.11 specification. Note: the kernel currently does not parse any optional HID class descriptors, only the mandatory report descriptor. Update all references to member element desc[0] to rpt_desc. Add test to verify bLength and bNumDescriptors values are valid. Replace the for loop with direct access to the mandatory HID class descriptor member for the report descriptor. This eliminates the possibility of getting an out-of-bounds fault. Add a warning message if the HID descriptor contains any unsupported optional HID class descriptors. Reported-by: syzbot+c52569baf0c843f35495@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c52569baf0c843f35495 Fixes: f043bfc98c19 ("HID: usbhid: fix out-of-bounds bug") Cc: stable@vger.kernel.org Signed-off-by: Terry Junge Reviewed-by: Michael Kelley Signed-off-by: Jiri Kosina --- include/linux/hid.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index ef9a90ca0fbd..daae1d6d11a7 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -740,8 +740,9 @@ struct hid_descriptor { __le16 bcdHID; __u8 bCountryCode; __u8 bNumDescriptors; + struct hid_class_descriptor rpt_desc; - struct hid_class_descriptor desc[1]; + struct hid_class_descriptor opt_descs[]; } __attribute__ ((packed)); #define HID_DEVICE(b, g, ven, prod) \ -- cgit v1.2.3 From a058002358b7aaf14674b2a0daa5194bfa5720a1 Mon Sep 17 00:00:00 2001 From: Aditya Garg Date: Thu, 10 Apr 2025 12:59:27 +0530 Subject: HID: quirks: Add HID_QUIRK_IGNORE_MOUSE quirk Some USB HID mice have drivers both in HID as well as a separate USB driver. The already existing hid_mouse_ignore_list in hid-quirks manages this, but is not yet configurable by usbhid.quirks, unlike all others like hid_ignore_list. Thus in some HID devices, where the vendor provides USB drivers only for the mouse and lets keyboard handled by the generic hid drivers, presence of such a quirk prevents the user from compiling hid core again to add the device to the table. Signed-off-by: Aditya Garg Signed-off-by: Jiri Kosina --- include/linux/hid.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index daae1d6d11a7..a1305210b2fd 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -357,6 +357,7 @@ struct hid_item { * | @HID_QUIRK_INPUT_PER_APP: * | @HID_QUIRK_X_INVERT: * | @HID_QUIRK_Y_INVERT: + * | @HID_QUIRK_IGNORE_MOUSE: * | @HID_QUIRK_SKIP_OUTPUT_REPORTS: * | @HID_QUIRK_SKIP_OUTPUT_REPORT_ID: * | @HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP: @@ -382,6 +383,7 @@ struct hid_item { #define HID_QUIRK_INPUT_PER_APP BIT(11) #define HID_QUIRK_X_INVERT BIT(12) #define HID_QUIRK_Y_INVERT BIT(13) +#define HID_QUIRK_IGNORE_MOUSE BIT(14) #define HID_QUIRK_SKIP_OUTPUT_REPORTS BIT(16) #define HID_QUIRK_SKIP_OUTPUT_REPORT_ID BIT(17) #define HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP BIT(18) -- cgit v1.2.3 From 7a3be00771aa9786c7bb4cdb0ee36fee45f67d69 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 23 Apr 2025 15:48:40 +0530 Subject: OPP: Return opp from dev_pm_opp_get() For convenience of users, return back the pointer to the opp from dev_pm_opp_get(), so they can do: opp = dev_pm_opp_get(tmp_opp); No intentional functional impact. Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index c247317aae38..5e4c3428b139 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -161,7 +161,7 @@ struct dev_pm_opp *dev_pm_opp_find_bw_ceil(struct device *dev, struct dev_pm_opp *dev_pm_opp_find_bw_floor(struct device *dev, unsigned int *bw, int index); -void dev_pm_opp_get(struct dev_pm_opp *opp); +struct dev_pm_opp *dev_pm_opp_get(struct dev_pm_opp *opp); void dev_pm_opp_put(struct dev_pm_opp *opp); int dev_pm_opp_add_dynamic(struct device *dev, struct dev_pm_opp_data *opp); @@ -345,7 +345,10 @@ static inline struct dev_pm_opp *dev_pm_opp_find_bw_floor(struct device *dev, return ERR_PTR(-EOPNOTSUPP); } -static inline void dev_pm_opp_get(struct dev_pm_opp *opp) {} +static inline struct dev_pm_opp *dev_pm_opp_get(struct dev_pm_opp *opp) +{ + return opp; +} static inline void dev_pm_opp_put(struct dev_pm_opp *opp) {} -- cgit v1.2.3 From ead694941686345bfd3f95100d889191cb9e3cda Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 23 Apr 2025 15:48:40 +0530 Subject: OPP: Return opp_table from dev_pm_opp_get_opp_table_ref() For convenience of users, return back the pointer to the opp_table from dev_pm_opp_get_opp_table_ref(), so they can do: opp_table = dev_pm_opp_get_opp_table_ref(tmp_table); No intentional functional impact. Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 5e4c3428b139..0deddfa91aca 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -100,7 +100,7 @@ struct dev_pm_opp_data { #if defined(CONFIG_PM_OPP) struct opp_table *dev_pm_opp_get_opp_table(struct device *dev); -void dev_pm_opp_get_opp_table_ref(struct opp_table *opp_table); +struct opp_table *dev_pm_opp_get_opp_table_ref(struct opp_table *opp_table); void dev_pm_opp_put_opp_table(struct opp_table *opp_table); unsigned long dev_pm_opp_get_bw(struct dev_pm_opp *opp, bool peak, int index); @@ -207,7 +207,10 @@ static inline struct opp_table *dev_pm_opp_get_opp_table_indexed(struct device * return ERR_PTR(-EOPNOTSUPP); } -static inline void dev_pm_opp_get_opp_table_ref(struct opp_table *opp_table) {} +static inline struct opp_table *dev_pm_opp_get_opp_table_ref(struct opp_table *opp_table) +{ + return opp_table; +} static inline void dev_pm_opp_put_opp_table(struct opp_table *opp_table) {} -- cgit v1.2.3 From 0128816c42b52c6ee339718621aeda85855cd3be Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Thu, 10 Apr 2025 18:51:43 +0800 Subject: genirq: Fix typo in IRQ_NOTCONNECTED comment Fix a minor typo in the comment for IRQ_NOTCONNECTED: "distingiush" is corrected to "distinguish". Signed-off-by: Cheng-Yang Chou Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250410105144.214849-1-yphbchou0911@gmail.com --- include/linux/interrupt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index c782a74d2a30..51b6484c0493 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -140,7 +140,7 @@ extern irqreturn_t no_action(int cpl, void *dev_id); /* * If a (PCI) device interrupt is not connected we set dev->irq to * IRQ_NOTCONNECTED. This causes request_irq() to fail with -ENOTCONN, so we - * can distingiush that case from other error returns. + * can distinguish that case from other error returns. * * 0x80000000 is guaranteed to be outside the available range of interrupts * and easy to distinguish from other possible incorrect values. -- cgit v1.2.3 From 49916e22d9530d6cf027e635a5d824c7d698d67f Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 18 Apr 2025 21:08:03 +0100 Subject: timers: Remove unused __round_jiffies(_up) Remove two trivial but long unused functions. __round_jiffies() has been unused since 2008's commit 9c133c469d38 ("Add round_jiffies_up and related routines") __round_jiffies_up() has been unused since 2019's commit 7ae3f6e130e8 ("powerpc/watchdog: Use hrtimers for per-CPU heartbeat") Remove them. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250418200803.427911-1-linux@treblig.org --- include/linux/timer.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 10596d7c3a34..e17aac74b5b3 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -172,12 +172,10 @@ extern void init_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); -unsigned long __round_jiffies(unsigned long j, int cpu); unsigned long __round_jiffies_relative(unsigned long j, int cpu); unsigned long round_jiffies(unsigned long j); unsigned long round_jiffies_relative(unsigned long j); -unsigned long __round_jiffies_up(unsigned long j, int cpu); unsigned long __round_jiffies_up_relative(unsigned long j, int cpu); unsigned long round_jiffies_up(unsigned long j); unsigned long round_jiffies_up_relative(unsigned long j); -- cgit v1.2.3 From 4094040b1a133206ed893da2cf5e17cc22f7ca7c Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Tue, 8 Apr 2025 11:45:59 +0300 Subject: mfd: rohm-bd96801: Support ROHM BD96802 The ROHM BD96802 PMIC looks from software point of view a lot like ROHM BD96801 PMIC. Just with reduced number of voltage rails. Both PMICs provide two physical IRQ lines referred as INTB and ERRB and contain blocks implementing regulator controls and a weatchdog. Hence it makes sense to use same MFD core for both PMICs. Add support for ROHM BD96802 scalable companion PMIC to the BD96801 core driver. Signed-off-by: Matti Vaittinen Acked-by: Conor Dooley Link: https://lore.kernel.org/r/05957d194425a79a4f35f287695c3d9ca2ed1ae2.1744090658.git.mazziesaccount@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/rohm-bd96801.h | 2 ++ include/linux/mfd/rohm-bd96802.h | 74 ++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/rohm-generic.h | 1 + 3 files changed, 77 insertions(+) create mode 100644 include/linux/mfd/rohm-bd96802.h (limited to 'include/linux') diff --git a/include/linux/mfd/rohm-bd96801.h b/include/linux/mfd/rohm-bd96801.h index e2d9e10b6364..68c8ac8ad409 100644 --- a/include/linux/mfd/rohm-bd96801.h +++ b/include/linux/mfd/rohm-bd96801.h @@ -40,7 +40,9 @@ * INTB status registers are at range 0x5c ... 0x63 */ #define BD96801_REG_INT_SYS_ERRB1 0x52 +#define BD96801_REG_INT_BUCK2_ERRB 0x56 #define BD96801_REG_INT_SYS_INTB 0x5c +#define BD96801_REG_INT_BUCK2_INTB 0x5e #define BD96801_REG_INT_LDO7_INTB 0x63 /* MASK registers */ diff --git a/include/linux/mfd/rohm-bd96802.h b/include/linux/mfd/rohm-bd96802.h new file mode 100644 index 000000000000..bf4b77944edf --- /dev/null +++ b/include/linux/mfd/rohm-bd96802.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2025 ROHM Semiconductors + * + * The digital interface of trhe BD96802 PMIC is a reduced version of the + * BD96801. Hence the BD96801 definitions are used for registers and masks + * while this header only holds the IRQ definitions - mainly to avoid gaps in + * IRQ numbers caused by the lack of some BUCKs / LDOs and their respective + * IRQs. + */ + +#ifndef __LINUX_MFD_BD96802_H__ +#define __LINUX_MFD_BD96802_H__ + +/* ERRB IRQs */ +enum { + /* Reg 0x52, 0x53, 0x54 - ERRB system IRQs */ + BD96802_OTP_ERR_STAT, + BD96802_DBIST_ERR_STAT, + BD96802_EEP_ERR_STAT, + BD96802_ABIST_ERR_STAT, + BD96802_PRSTB_ERR_STAT, + BD96802_DRMOS1_ERR_STAT, + BD96802_DRMOS2_ERR_STAT, + BD96802_SLAVE_ERR_STAT, + BD96802_VREF_ERR_STAT, + BD96802_TSD_ERR_STAT, + BD96802_UVLO_ERR_STAT, + BD96802_OVLO_ERR_STAT, + BD96802_OSC_ERR_STAT, + BD96802_PON_ERR_STAT, + BD96802_POFF_ERR_STAT, + BD96802_CMD_SHDN_ERR_STAT, + BD96802_INT_SHDN_ERR_STAT, + + /* Reg 0x55 BUCK1 ERR IRQs */ + BD96802_BUCK1_PVIN_ERR_STAT, + BD96802_BUCK1_OVP_ERR_STAT, + BD96802_BUCK1_UVP_ERR_STAT, + BD96802_BUCK1_SHDN_ERR_STAT, + + /* Reg 0x56 BUCK2 ERR IRQs */ + BD96802_BUCK2_PVIN_ERR_STAT, + BD96802_BUCK2_OVP_ERR_STAT, + BD96802_BUCK2_UVP_ERR_STAT, + BD96802_BUCK2_SHDN_ERR_STAT, +}; + +/* INTB IRQs */ +enum { + /* Reg 0x5c (System INTB) */ + BD96802_TW_STAT, + BD96802_WDT_ERR_STAT, + BD96802_I2C_ERR_STAT, + BD96802_CHIP_IF_ERR_STAT, + + /* Reg 0x5d (BUCK1 INTB) */ + BD96802_BUCK1_OCPH_STAT, + BD96802_BUCK1_OCPL_STAT, + BD96802_BUCK1_OCPN_STAT, + BD96802_BUCK1_OVD_STAT, + BD96802_BUCK1_UVD_STAT, + BD96802_BUCK1_TW_CH_STAT, + + /* Reg 0x5e (BUCK2 INTB) */ + BD96802_BUCK2_OCPH_STAT, + BD96802_BUCK2_OCPL_STAT, + BD96802_BUCK2_OCPN_STAT, + BD96802_BUCK2_OVD_STAT, + BD96802_BUCK2_UVD_STAT, + BD96802_BUCK2_TW_CH_STAT, +}; + +#endif diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h index e7d4e6afe388..11b86f9129e3 100644 --- a/include/linux/mfd/rohm-generic.h +++ b/include/linux/mfd/rohm-generic.h @@ -17,6 +17,7 @@ enum rohm_chip_type { ROHM_CHIP_TYPE_BD71837, ROHM_CHIP_TYPE_BD71847, ROHM_CHIP_TYPE_BD96801, + ROHM_CHIP_TYPE_BD96802, ROHM_CHIP_TYPE_AMOUNT }; -- cgit v1.2.3 From 6a309b489215f705c20cb4ed7f85d9c16f768e55 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Tue, 8 Apr 2025 11:46:36 +0300 Subject: mfd: bd96801: Support ROHM BD96805 The ROHM BD96805 is from the software perspective almost identical to the ROHM BD96801. The main difference is different voltage tuning ranges. Add support differentiating these PMICs based on the compatible, and invoking the regulator driver with correct IC type. Signed-off-by: Matti Vaittinen Acked-by: Conor Dooley Link: https://lore.kernel.org/r/8680097dc083f191bea56d3ac7c6fe5c005644ec.1744090658.git.mazziesaccount@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/rohm-generic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h index 11b86f9129e3..867acf5baefc 100644 --- a/include/linux/mfd/rohm-generic.h +++ b/include/linux/mfd/rohm-generic.h @@ -18,6 +18,7 @@ enum rohm_chip_type { ROHM_CHIP_TYPE_BD71847, ROHM_CHIP_TYPE_BD96801, ROHM_CHIP_TYPE_BD96802, + ROHM_CHIP_TYPE_BD96805, ROHM_CHIP_TYPE_AMOUNT }; -- cgit v1.2.3 From fecc18a9f59ce9c36eb3622ae77bff5fa5c6d976 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Tue, 8 Apr 2025 11:47:10 +0300 Subject: mfd: bd96801: Support ROHM BD96806 The ROHM BD96806 is from the software perspective almost identical to the ROHM BD96802. The main difference is different voltage tuning ranges. Add support differentiating these PMICs based on the compatible, and invoking the regulator driver with correct IC type. Signed-off-by: Matti Vaittinen Acked-by: Conor Dooley Link: https://lore.kernel.org/r/ccc95ae33613648fdcba08915777d945412ac5c4.1744090658.git.mazziesaccount@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/rohm-generic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h index 867acf5baefc..579e8dcfcca4 100644 --- a/include/linux/mfd/rohm-generic.h +++ b/include/linux/mfd/rohm-generic.h @@ -19,6 +19,7 @@ enum rohm_chip_type { ROHM_CHIP_TYPE_BD96801, ROHM_CHIP_TYPE_BD96802, ROHM_CHIP_TYPE_BD96805, + ROHM_CHIP_TYPE_BD96806, ROHM_CHIP_TYPE_AMOUNT }; -- cgit v1.2.3 From 7f8ce4d88b42fcbd3350370ec4d02e00979fc5a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 17 Apr 2025 20:16:11 +0200 Subject: pwm: Fix various formatting issues in kernel-doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Return and (where interesting) Context sections, fix some formatting and drop documenting the internal function __pwm_apply(). Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20250417181611.2693599-2-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index bf0469b2201d..63a17d2b4ec8 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -218,6 +218,8 @@ static inline void pwm_init_state(const struct pwm_device *pwm, * * pwm_get_state(pwm, &state); * duty = pwm_get_relative_duty_cycle(&state, 100); + * + * Returns: rounded relative duty cycle multiplied by @scale */ static inline unsigned int pwm_get_relative_duty_cycle(const struct pwm_state *state, unsigned int scale) @@ -244,8 +246,8 @@ pwm_get_relative_duty_cycle(const struct pwm_state *state, unsigned int scale) * pwm_set_relative_duty_cycle(&state, 50, 100); * pwm_apply_might_sleep(pwm, &state); * - * This functions returns -EINVAL if @duty_cycle and/or @scale are - * inconsistent (@scale == 0 or @duty_cycle > @scale). + * Returns: 0 on success or ``-EINVAL`` if @duty_cycle and/or @scale are + * inconsistent (@scale == 0 or @duty_cycle > @scale) */ static inline int pwm_set_relative_duty_cycle(struct pwm_state *state, unsigned int duty_cycle, @@ -351,7 +353,7 @@ struct pwm_chip { * pwmchip_supports_waveform() - checks if the given chip supports waveform callbacks * @chip: The pwm_chip to test * - * Returns true iff the pwm chip support the waveform functions like + * Returns: true iff the pwm chip support the waveform functions like * pwm_set_waveform_might_sleep() and pwm_round_waveform_might_sleep() */ static inline bool pwmchip_supports_waveform(struct pwm_chip *chip) -- cgit v1.2.3 From 309d28576f0a23931a36bf67324868448f79a77e Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 27 Mar 2025 12:39:56 -0500 Subject: KVM: SVM: Fix SNP AP destroy race with VMRUN An AP destroy request for a target vCPU is typically followed by an RMPADJUST to remove the VMSA attribute from the page currently being used as the VMSA for the target vCPU. This can result in a vCPU that is about to VMRUN to exit with #VMEXIT_INVALID. This usually does not happen as APs are typically sitting in HLT when being destroyed and therefore the vCPU thread is not running at the time. However, if HLT is allowed inside the VM, then the vCPU could be about to VMRUN when the VMSA attribute is removed from the VMSA page, resulting in a #VMEXIT_INVALID when the vCPU actually issues the VMRUN and causing the guest to crash. An RMPADJUST against an in-use (already running) VMSA results in a #NPF for the vCPU issuing the RMPADJUST, so the VMSA attribute cannot be changed until the VMRUN for target vCPU exits. The Qemu command line option '-overcommit cpu-pm=on' is an example of allowing HLT inside the guest. Update the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event to include the KVM_REQUEST_WAIT flag. The kvm_vcpu_kick() function will not wait for requests to be honored, so create kvm_make_request_and_kick() that will add a new event request and honor the KVM_REQUEST_WAIT flag. This will ensure that the target vCPU sees the AP destroy request before returning to the initiating vCPU should the target vCPU be in guest mode. Fixes: e366f92ea99e ("KVM: SEV: Support SEV-SNP AP Creation NAE event") Signed-off-by: Tom Lendacky Link: https://lore.kernel.org/r/fe2c885bf35643dd224e91294edb6777d5df23a4.1743097196.git.thomas.lendacky@amd.com [sean: add a comment explaining the use of smp_send_reschedule()] Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1dedc421b3e3..c685fb417e92 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1505,7 +1505,16 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); -void kvm_vcpu_kick(struct kvm_vcpu *vcpu); + +#ifndef CONFIG_S390 +void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait); + +static inline void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +{ + __kvm_vcpu_kick(vcpu, false); +} +#endif + int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode); @@ -2253,6 +2262,14 @@ static __always_inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) __kvm_make_request(req, vcpu); } +#ifndef CONFIG_S390 +static inline void kvm_make_request_and_kick(int req, struct kvm_vcpu *vcpu) +{ + kvm_make_request(req, vcpu); + __kvm_vcpu_kick(vcpu, req & KVM_REQUEST_WAIT); +} +#endif + static inline bool kvm_request_pending(struct kvm_vcpu *vcpu) { return READ_ONCE(vcpu->requests); -- cgit v1.2.3 From bc2550b4e195754fbb24aac1f012d3dd9e3b4edc Mon Sep 17 00:00:00 2001 From: Jeremy Harris Date: Wed, 23 Apr 2025 13:43:33 +0100 Subject: tcp: fastopen: note that a child socket was created tcp: fastopen: note that a child socket was created This uses up the last bit in a field of tcp_sock. Signed-off-by: Jeremy Harris Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250423124334.4916-2-jgh@exim.org Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1669d95bb0f9..a8af71623ba7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -385,7 +385,8 @@ struct tcp_sock { syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ - syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ + syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ + syn_fastopen_child:1; /* created TFO passive child socket */ u8 keepalive_probes; /* num of allowed keep alive probes */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ -- cgit v1.2.3 From d57ee99831e336576359beb26e2b140511c99106 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 23 Apr 2025 17:08:08 +0200 Subject: net: ethernet: mtk_wed: annotate RCU release in attach() There are some sparse warnings in wifi, and it seems that it's actually possible to annotate a function pointer with __releases(), making the sparse warnings go away. In a way that also serves as documentation that rcu_read_unlock() must be called in the attach method, so add that annotation. Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20250423150811.456205-2-johannes@sipsolutions.net Signed-off-by: Jakub Kicinski --- include/linux/soc/mediatek/mtk_wed.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index a476648858a6..d8949a4ed0dc 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -192,7 +192,7 @@ struct mtk_wed_device { }; struct mtk_wed_ops { - int (*attach)(struct mtk_wed_device *dev); + int (*attach)(struct mtk_wed_device *dev) __releases(RCU); int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, void __iomem *regs, bool reset); int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring, -- cgit v1.2.3 From eae324ca644554d5ce363186bee820a088bb74ab Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sun, 20 Apr 2025 12:47:25 +0200 Subject: configfs: Add CONFIGFS_ATTR_PERM helper This new helper allows creating rw files with custom permissions. Signed-off-by: Richard Weinberger Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20250420104726.2963750-1-richard@nod.at Signed-off-by: Andreas Hindborg --- include/linux/configfs.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/configfs.h b/include/linux/configfs.h index c771e9d0d0b9..698520b1bfdb 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -120,15 +120,19 @@ struct configfs_attribute { ssize_t (*store)(struct config_item *, const char *, size_t); }; -#define CONFIGFS_ATTR(_pfx, _name) \ +#define CONFIGFS_ATTR_PERM(_pfx, _name, _perm) \ static struct configfs_attribute _pfx##attr_##_name = { \ .ca_name = __stringify(_name), \ - .ca_mode = S_IRUGO | S_IWUSR, \ + .ca_mode = _perm, \ .ca_owner = THIS_MODULE, \ .show = _pfx##_name##_show, \ .store = _pfx##_name##_store, \ } +#define CONFIGFS_ATTR(_pfx, _name) CONFIGFS_ATTR_PERM( \ + _pfx, _name, S_IRUGO | S_IWUSR \ +) + #define CONFIGFS_ATTR_RO(_pfx, _name) \ static struct configfs_attribute _pfx##attr_##_name = { \ .ca_name = __stringify(_name), \ -- cgit v1.2.3 From be4e3097c1f800b0f39e7e60b2b28eb6603f5d06 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Wed, 23 Apr 2025 22:40:56 +0800 Subject: tty: Remove unused API tty_port_register_device_serdev() Remove API tty_port_register_device_serdev() which has no caller. Signed-off-by: Zijun Hu Reviewed-by: Jiri Slaby Link: https://lore.kernel.org/r/20250423-remove_api-v1-1-fac673d09feb@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_port.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h index 1b861f2100b6..08f89a598366 100644 --- a/include/linux/tty_port.h +++ b/include/linux/tty_port.h @@ -147,9 +147,6 @@ struct device *tty_port_register_device_attr(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); -struct device *tty_port_register_device_serdev(struct tty_port *port, - struct tty_driver *driver, unsigned index, - struct device *host, struct device *parent); struct device *tty_port_register_device_attr_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *host, struct device *parent, void *drvdata, -- cgit v1.2.3 From 1404d3509c768732be51d0acf8330689936a692a Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Fri, 25 Apr 2025 13:13:12 +0200 Subject: serial: switch uart_port::iotype to enum uart_iotype The inline-defined constants look weird. Instead, define a proper enum for them and type uart_port::iotype as that enum. This allows for proper checking in switch-case labels (somewhere, a default or UPIO_UNKNOWN label needs to be added/handled). Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20250425111315.1036184-4-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 743b4afaad4c..914b5e97e056 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -427,6 +427,18 @@ struct uart_icount { typedef u64 __bitwise upf_t; typedef unsigned int __bitwise upstat_t; +enum uart_iotype { + UPIO_UNKNOWN = -1, + UPIO_PORT = SERIAL_IO_PORT, /* 8b I/O port access */ + UPIO_HUB6 = SERIAL_IO_HUB6, /* Hub6 ISA card */ + UPIO_MEM = SERIAL_IO_MEM, /* driver-specific */ + UPIO_MEM32 = SERIAL_IO_MEM32, /* 32b little endian */ + UPIO_AU = SERIAL_IO_AU, /* Au1x00 and RT288x type IO */ + UPIO_TSI = SERIAL_IO_TSI, /* Tsi108/109 type IO */ + UPIO_MEM32BE = SERIAL_IO_MEM32BE, /* 32b big endian */ + UPIO_MEM16 = SERIAL_IO_MEM16, /* 16b little endian */ +}; + struct uart_port { spinlock_t lock; /* port lock */ unsigned long iobase; /* in/out[bwl] */ @@ -469,23 +481,13 @@ struct uart_port { unsigned char x_char; /* xon/xoff char */ unsigned char regshift; /* reg offset shift */ - unsigned char iotype; /* io access style */ - -#define UPIO_UNKNOWN ((unsigned char)~0U) /* UCHAR_MAX */ -#define UPIO_PORT (SERIAL_IO_PORT) /* 8b I/O port access */ -#define UPIO_HUB6 (SERIAL_IO_HUB6) /* Hub6 ISA card */ -#define UPIO_MEM (SERIAL_IO_MEM) /* driver-specific */ -#define UPIO_MEM32 (SERIAL_IO_MEM32) /* 32b little endian */ -#define UPIO_AU (SERIAL_IO_AU) /* Au1x00 and RT288x type IO */ -#define UPIO_TSI (SERIAL_IO_TSI) /* Tsi108/109 type IO */ -#define UPIO_MEM32BE (SERIAL_IO_MEM32BE) /* 32b big endian */ -#define UPIO_MEM16 (SERIAL_IO_MEM16) /* 16b little endian */ - unsigned char quirks; /* internal quirks */ /* internal quirks must be updated while holding port mutex */ #define UPQ_NO_TXEN_TEST BIT(0) + enum uart_iotype iotype; /* io access style */ + unsigned int read_status_mask; /* driver specific */ unsigned int ignore_status_mask; /* driver specific */ struct uart_state *state; /* pointer to parent state */ @@ -1101,8 +1103,8 @@ static inline bool uart_console_registered(struct uart_port *port) struct uart_port *uart_get_console(struct uart_port *ports, int nr, struct console *c); -int uart_parse_earlycon(char *p, unsigned char *iotype, resource_size_t *addr, - char **options); +int uart_parse_earlycon(char *p, enum uart_iotype *iotype, + resource_size_t *addr, char **options); void uart_parse_options(const char *options, int *baud, int *parity, int *bits, int *flow); int uart_set_options(struct uart_port *port, struct console *co, int baud, -- cgit v1.2.3 From 5117f28a7d78d00a44d03463115a0f295dbbb1ff Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Tue, 15 Apr 2025 12:35:58 +0100 Subject: comedi: remove the mapping of the Comedi buffer in vmalloc address space Now that all the code that accesses the Comedi buffer data does so page-by-page, using the `virt_addr` member of `struct comedi_buf_page` to point to the data of each page, do not linearly map the buffer into vmalloc address space (pointed to by the `prealloc_buf` member of `struct comedi_async`). That was only done for convenience, but was not done for those drivers that need a DMA coherent buffer, which is allocated in a single chunk. Remove the `prealloc_buf` member as it is no longer used. Signed-off-by: Ian Abbott Link: https://lore.kernel.org/r/20250415114008.5977-4-abbotti@mev.co.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/comedi/comedidev.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/comedi/comedidev.h b/include/linux/comedi/comedidev.h index c08416a7364b..4cb0400ad616 100644 --- a/include/linux/comedi/comedidev.h +++ b/include/linux/comedi/comedidev.h @@ -234,16 +234,12 @@ struct comedi_buf_page { * * A COMEDI data buffer is allocated as individual pages, either in * conventional memory or DMA coherent memory, depending on the attached, - * low-level hardware device. (The buffer pages also get mapped into the - * kernel's contiguous virtual address space pointed to by the 'prealloc_buf' - * member of &struct comedi_async.) + * low-level hardware device. * * The buffer is normally freed when the COMEDI device is detached from the * low-level driver (which may happen due to device removal), but if it happens * to be mmapped at the time, the pages cannot be freed until the buffer has - * been munmapped. That is what the reference counter is for. (The virtual - * address space pointed by 'prealloc_buf' is freed when the COMEDI device is - * detached.) + * been munmapped. That is what the reference counter is for. */ struct comedi_buf_map { struct device *dma_hw_dev; @@ -255,7 +251,6 @@ struct comedi_buf_map { /** * struct comedi_async - Control data for asynchronous COMEDI commands - * @prealloc_buf: Kernel virtual address of allocated acquisition buffer. * @prealloc_bufsz: Buffer size (in bytes). * @buf_map: Map of buffer pages. * @max_bufsize: Maximum allowed buffer size (in bytes). @@ -344,7 +339,6 @@ struct comedi_buf_map { * less than or equal to UINT_MAX). */ struct comedi_async { - void *prealloc_buf; unsigned int prealloc_bufsz; struct comedi_buf_map *buf_map; unsigned int max_bufsize; -- cgit v1.2.3 From dbc8c84d5c2e200c79393c17896bdc65e6daef30 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 20 Apr 2025 15:57:39 +0100 Subject: misc: rtsx: Remove deadcode The last uses of rtsx_ms_power_off_card3v3() and rtsx_sd_power_off_card3v3() were removed by 2019's commit bede03a579b3 ("misc: rtsx: Enable OCP for rts522a rts524a rts525a rts5260") The last use of rtsx_pci_transfer_data() was removed by 2024's commit d0f459259c13 ("memstick: rtsx_pci_ms: Remove Realtek PCI memstick driver") Remove them. Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20250420145739.58337-1-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/rtsx_pci.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtsx_pci.h b/include/linux/rtsx_pci.h index 4612ef09a0c7..3b4c36705a9b 100644 --- a/include/linux/rtsx_pci.h +++ b/include/linux/rtsx_pci.h @@ -1312,8 +1312,6 @@ void rtsx_pci_add_cmd(struct rtsx_pcr *pcr, u8 cmd_type, u16 reg_addr, u8 mask, u8 data); void rtsx_pci_send_cmd_no_wait(struct rtsx_pcr *pcr); int rtsx_pci_send_cmd(struct rtsx_pcr *pcr, int timeout); -int rtsx_pci_transfer_data(struct rtsx_pcr *pcr, struct scatterlist *sglist, - int num_sg, bool read, int timeout); int rtsx_pci_dma_map_sg(struct rtsx_pcr *pcr, struct scatterlist *sglist, int num_sg, bool read); void rtsx_pci_dma_unmap_sg(struct rtsx_pcr *pcr, struct scatterlist *sglist, -- cgit v1.2.3 From 142ba31d8b4aff086c4f080bd97d437232922177 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Sun, 20 Apr 2025 12:19:09 +0800 Subject: PM: wakeup: Do not expose 4 device wakeup source APIs The following 4 APIs are only used by drivers/base/power/wakeup.c internally. - wakeup_source_create() - wakeup_source_destroy() - wakeup_source_add() - wakeup_source_remove() Do not expose them by making them as static functions. Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250420-fix_power-v2-1-9b938d2283aa@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/pm_wakeup.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index 51e0e8dd5f9e..c838b4a30f87 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -95,10 +95,6 @@ static inline void device_set_wakeup_path(struct device *dev) } /* drivers/base/power/wakeup.c */ -extern struct wakeup_source *wakeup_source_create(const char *name); -extern void wakeup_source_destroy(struct wakeup_source *ws); -extern void wakeup_source_add(struct wakeup_source *ws); -extern void wakeup_source_remove(struct wakeup_source *ws); extern struct wakeup_source *wakeup_source_register(struct device *dev, const char *name); extern void wakeup_source_unregister(struct wakeup_source *ws); @@ -129,17 +125,6 @@ static inline bool device_can_wakeup(struct device *dev) return dev->power.can_wakeup; } -static inline struct wakeup_source *wakeup_source_create(const char *name) -{ - return NULL; -} - -static inline void wakeup_source_destroy(struct wakeup_source *ws) {} - -static inline void wakeup_source_add(struct wakeup_source *ws) {} - -static inline void wakeup_source_remove(struct wakeup_source *ws) {} - static inline struct wakeup_source *wakeup_source_register(struct device *dev, const char *name) { -- cgit v1.2.3 From 477058411c45f225ddfbb4769e35a9a5a95cb826 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 25 Apr 2025 10:11:30 +0200 Subject: pidfs: register pid in pidfs Add simple helpers that allow a struct pid to be pinned via a pidfs dentry/inode. If no pidfs dentry exists a new one will be allocated for it. A reference is taken by pidfs on @pid. The reference must be released via pidfs_put_pid(). This will allow AF_UNIX sockets to allocate a dentry for the peer credentials pid at the time they are recorded where we know the task is still alive. When the task gets reaped its exit status is guaranteed to be recorded and a pidfd can be handed out for the reaped task. Link: https://lore.kernel.org/20250425-work-pidfs-net-v2-1-450a19461e75@kernel.org Reviewed-by: Oleg Nesterov Reviewed-by: David Rheinsberg Signed-off-by: Christian Brauner --- include/linux/pidfs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 05e6f8f4a026..2676890c4d0d 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -8,5 +8,8 @@ void pidfs_add_pid(struct pid *pid); void pidfs_remove_pid(struct pid *pid); void pidfs_exit(struct task_struct *tsk); extern const struct dentry_operations pidfs_dentry_operations; +int pidfs_register_pid(struct pid *pid); +void pidfs_get_pid(struct pid *pid); +void pidfs_put_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From a71f402acd71a942e59c16270ad61dee06de6e24 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 25 Apr 2025 10:11:32 +0200 Subject: pidfs: get rid of __pidfd_prepare() Fold it into pidfd_prepare() and rename PIDFD_CLONE to PIDFD_STALE to indicate that the passed pid might not have task linkage and no explicit check for that should be performed. Link: https://lore.kernel.org/20250425-work-pidfs-net-v2-3-450a19461e75@kernel.org Reviewed-by: Oleg Nesterov Reviewed-by: David Rheinsberg Signed-off-by: Christian Brauner --- include/linux/pid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pid.h b/include/linux/pid.h index 311ecebd7d56..453ae6d8a68d 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -77,7 +77,7 @@ struct file; struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags); -int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret); +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file); void do_notify_pidfd(struct task_struct *task); static inline struct pid *get_pid(struct pid *pid) -- cgit v1.2.3 From 6cccf837ac8d72e13c651f60d93545f9fb4e84ed Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 26 Apr 2025 11:21:19 +0200 Subject: Revert "vt: create ucs_recompose.c using gen_ucs_recompose.py" This reverts commit 54af55b990eda5a6a0140a3cded8094b42c0c3b7. A new version of the series was submitted, so it's easier to revert the old one and add the new one due to the changes invovled. Cc: Nicolas Pitre Cc: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- include/linux/consolemap.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h index 4d3a34c288e5..b3a911866662 100644 --- a/include/linux/consolemap.h +++ b/include/linux/consolemap.h @@ -30,7 +30,6 @@ int conv_uni_to_8bit(u32 uni); void console_map_init(void); bool ucs_is_double_width(uint32_t cp); bool ucs_is_zero_width(uint32_t cp); -uint32_t ucs_recompose(uint32_t base, uint32_t combining); #else static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode) @@ -70,11 +69,6 @@ static inline bool ucs_is_zero_width(uint32_t cp) { return false; } - -static inline uint32_t ucs_recompose(uint32_t base, uint32_t combining) -{ - return 0; -} #endif /* CONFIG_CONSOLE_TRANSLATIONS */ #endif /* __LINUX_CONSOLEMAP_H__ */ -- cgit v1.2.3 From 67a4bb27461b0e3b39b27db688b5981b6eb62175 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 26 Apr 2025 11:21:21 +0200 Subject: Revert "vt: update ucs_width.c using gen_ucs_width.py" This reverts commit 3a1ab63aa05b4736a7d30ae0a769385662f13def. A new version of the series was submitted, so it's easier to revert the old one and add the new one due to the changes invovled. Cc: Nicolas Pitre Cc: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- include/linux/consolemap.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h index b3a911866662..7d778752dcef 100644 --- a/include/linux/consolemap.h +++ b/include/linux/consolemap.h @@ -29,7 +29,11 @@ u32 conv_8bit_to_uni(unsigned char c); int conv_uni_to_8bit(u32 uni); void console_map_init(void); bool ucs_is_double_width(uint32_t cp); -bool ucs_is_zero_width(uint32_t cp); +static inline bool ucs_is_zero_width(uint32_t cp) +{ + /* coming soon */ + return false; +} #else static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode) -- cgit v1.2.3 From d3e92076c1af713e65edac109499c25c37f38c16 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 26 Apr 2025 11:21:24 +0200 Subject: Revert "vt: properly support zero-width Unicode code points" This reverts commit e88391f730e46d208b7fb37b02611d24137af1ef. A new version of the series was submitted, so it's easier to revert the old one and add the new one due to the changes invovled. Cc: Nicolas Pitre Cc: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- include/linux/consolemap.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h index 7d778752dcef..caf079bcb8c9 100644 --- a/include/linux/consolemap.h +++ b/include/linux/consolemap.h @@ -29,11 +29,6 @@ u32 conv_8bit_to_uni(unsigned char c); int conv_uni_to_8bit(u32 uni); void console_map_init(void); bool ucs_is_double_width(uint32_t cp); -static inline bool ucs_is_zero_width(uint32_t cp) -{ - /* coming soon */ - return false; -} #else static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode) @@ -68,11 +63,6 @@ static inline bool ucs_is_double_width(uint32_t cp) { return false; } - -static inline bool ucs_is_zero_width(uint32_t cp) -{ - return false; -} #endif /* CONFIG_CONSOLE_TRANSLATIONS */ #endif /* __LINUX_CONSOLEMAP_H__ */ -- cgit v1.2.3 From e42e607aefc4132d508a0e5724b5d0975d0a53e8 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 26 Apr 2025 11:21:25 +0200 Subject: Revert "vt: move unicode processing to a separate file" This reverts commit 2acaf27cd7f4f32bfe8bf7335690618e2417e744. A new version of the series was submitted, so it's easier to revert the old one and add the new one due to the changes invovled. Cc: Nicolas Pitre Cc: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- include/linux/consolemap.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h index caf079bcb8c9..c35db4896c37 100644 --- a/include/linux/consolemap.h +++ b/include/linux/consolemap.h @@ -28,7 +28,6 @@ int conv_uni_to_pc(struct vc_data *conp, long ucs); u32 conv_8bit_to_uni(unsigned char c); int conv_uni_to_8bit(u32 uni); void console_map_init(void); -bool ucs_is_double_width(uint32_t cp); #else static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode) @@ -58,11 +57,6 @@ static inline int conv_uni_to_8bit(u32 uni) } static inline void console_map_init(void) { } - -static inline bool ucs_is_double_width(uint32_t cp) -{ - return false; -} #endif /* CONFIG_CONSOLE_TRANSLATIONS */ #endif /* __LINUX_CONSOLEMAP_H__ */ -- cgit v1.2.3 From 07bc3f442f47b4d158468c2e0146475bdf009091 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 17 Apr 2025 14:45:04 -0400 Subject: vt: move unicode processing to a separate file This will make it easier to maintain. Also make it depend on CONFIG_CONSOLE_TRANSLATIONS. Signed-off-by: Nicolas Pitre Reviewed-by: Jiri Slaby Link: https://lore.kernel.org/r/20250417184849.475581-3-nico@fluxnic.net Signed-off-by: Greg Kroah-Hartman --- include/linux/consolemap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h index c35db4896c37..caf079bcb8c9 100644 --- a/include/linux/consolemap.h +++ b/include/linux/consolemap.h @@ -28,6 +28,7 @@ int conv_uni_to_pc(struct vc_data *conp, long ucs); u32 conv_8bit_to_uni(unsigned char c); int conv_uni_to_8bit(u32 uni); void console_map_init(void); +bool ucs_is_double_width(uint32_t cp); #else static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode) @@ -57,6 +58,11 @@ static inline int conv_uni_to_8bit(u32 uni) } static inline void console_map_init(void) { } + +static inline bool ucs_is_double_width(uint32_t cp) +{ + return false; +} #endif /* CONFIG_CONSOLE_TRANSLATIONS */ #endif /* __LINUX_CONSOLEMAP_H__ */ -- cgit v1.2.3 From 95b05de0a56699392e67590d000df76fedec609a Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 17 Apr 2025 14:45:05 -0400 Subject: vt: properly support zero-width Unicode code points Zero-width Unicode code points are causing misalignment in vertically aligned content, disrupting the visual layout. Let's handle zero-width code points more intelligently. Double-width code points are stored in the screen grid followed by a white space code point to create the expected screen layout. When a double-width code point is followed by a zero-width code point in the console incoming bytestream (e.g., an emoji with a presentation selector) then we may replace the white space padding by that zero-width code point instead of dropping it. This maximize screen content information while preserving proper layout. If a zero-width code point is preceded by a single-width code point then the above trick is not possible and such zero-width code point must be dropped. VS16 (Variation Selector 16, U+FE0F) is special as it typically doubles the width of the preceding single-width code point. We handle that case by giving VS16 a width of 1 instead of 0 when that happens. Signed-off-by: Nicolas Pitre Reviewed-by: Jiri Slaby Link: https://lore.kernel.org/r/20250417184849.475581-4-nico@fluxnic.net Signed-off-by: Greg Kroah-Hartman --- include/linux/consolemap.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h index caf079bcb8c9..7d778752dcef 100644 --- a/include/linux/consolemap.h +++ b/include/linux/consolemap.h @@ -29,6 +29,11 @@ u32 conv_8bit_to_uni(unsigned char c); int conv_uni_to_8bit(u32 uni); void console_map_init(void); bool ucs_is_double_width(uint32_t cp); +static inline bool ucs_is_zero_width(uint32_t cp) +{ + /* coming soon */ + return false; +} #else static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode) @@ -63,6 +68,11 @@ static inline bool ucs_is_double_width(uint32_t cp) { return false; } + +static inline bool ucs_is_zero_width(uint32_t cp) +{ + return false; +} #endif /* CONFIG_CONSOLE_TRANSLATIONS */ #endif /* __LINUX_CONSOLEMAP_H__ */ -- cgit v1.2.3 From 54cda9201c673fd1c5de189d961670999232e49d Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 17 Apr 2025 14:45:08 -0400 Subject: vt: use new tables in ucs.c This removes the table from ucs.c and substitutes the generated tables from ucs_width_table.h providing comprehensive ranges for double-width and zero-width Unicode code points. Also implements ucs_is_zero_width() to query the new zero-width table. Signed-off-by: Nicolas Pitre Reviewed-by: Jiri Slaby Link: https://lore.kernel.org/r/20250417184849.475581-7-nico@fluxnic.net Signed-off-by: Greg Kroah-Hartman --- include/linux/consolemap.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h index 7d778752dcef..b3a911866662 100644 --- a/include/linux/consolemap.h +++ b/include/linux/consolemap.h @@ -29,11 +29,7 @@ u32 conv_8bit_to_uni(unsigned char c); int conv_uni_to_8bit(u32 uni); void console_map_init(void); bool ucs_is_double_width(uint32_t cp); -static inline bool ucs_is_zero_width(uint32_t cp) -{ - /* coming soon */ - return false; -} +bool ucs_is_zero_width(uint32_t cp); #else static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode) -- cgit v1.2.3 From b5c574995d842d241d810f3a6a3ebb03c52d57fa Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 17 Apr 2025 14:45:11 -0400 Subject: vt: support Unicode recomposition Try replacing any decomposed Unicode sequence by the corresponding recomposed code point. Code point to glyph correspondance works best after recomposition, and this apply mostly to single-width code points therefore we can't preserve them in their decomposed form anyway. Signed-off-by: Nicolas Pitre Reviewed-by: Jiri Slaby Link: https://lore.kernel.org/r/20250417184849.475581-10-nico@fluxnic.net Signed-off-by: Greg Kroah-Hartman --- include/linux/consolemap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h index b3a911866662..8167494229db 100644 --- a/include/linux/consolemap.h +++ b/include/linux/consolemap.h @@ -30,6 +30,7 @@ int conv_uni_to_8bit(u32 uni); void console_map_init(void); bool ucs_is_double_width(uint32_t cp); bool ucs_is_zero_width(uint32_t cp); +u32 ucs_recompose(u32 base, u32 mark); #else static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode) @@ -69,6 +70,11 @@ static inline bool ucs_is_zero_width(uint32_t cp) { return false; } + +static inline u32 ucs_recompose(u32 base, u32 mark) +{ + return 0; +} #endif /* CONFIG_CONSOLE_TRANSLATIONS */ #endif /* __LINUX_CONSOLEMAP_H__ */ -- cgit v1.2.3 From f5e5631df596ade5875ba1dc5d640611745d0c0d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 20 Feb 2025 18:20:26 +0200 Subject: devres: Move devm_*_action*() APIs to devres.h We have a newly created header linux/device/devres.h that gathers device managed APIs, so users won't need to include entire device.h for only these ones. Move devm_*_action*() APIs to devres.h as well. Reviewed-by: Raag Jadav Signed-off-by: Andy Shevchenko Reviewed-by: Zijun Hu Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250220162238.2738038-2-andriy.shevchenko@linux.intel.com Signed-off-by: Bartosz Golaszewski --- include/linux/device.h | 38 -------------------------------------- include/linux/device/devres.h | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 79e49fe494b7..4940db137fff 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -281,44 +281,6 @@ int __must_check device_create_bin_file(struct device *dev, void device_remove_bin_file(struct device *dev, const struct bin_attribute *attr); -/* allows to add/remove a custom action to devres stack */ -int devm_remove_action_nowarn(struct device *dev, void (*action)(void *), void *data); - -/** - * devm_remove_action() - removes previously added custom action - * @dev: Device that owns the action - * @action: Function implementing the action - * @data: Pointer to data passed to @action implementation - * - * Removes instance of @action previously added by devm_add_action(). - * Both action and data should match one of the existing entries. - */ -static inline -void devm_remove_action(struct device *dev, void (*action)(void *), void *data) -{ - WARN_ON(devm_remove_action_nowarn(dev, action, data)); -} - -void devm_release_action(struct device *dev, void (*action)(void *), void *data); - -int __devm_add_action(struct device *dev, void (*action)(void *), void *data, const char *name); -#define devm_add_action(dev, action, data) \ - __devm_add_action(dev, action, data, #action) - -static inline int __devm_add_action_or_reset(struct device *dev, void (*action)(void *), - void *data, const char *name) -{ - int ret; - - ret = __devm_add_action(dev, action, data, name); - if (ret) - action(data); - - return ret; -} -#define devm_add_action_or_reset(dev, action, data) \ - __devm_add_action_or_reset(dev, action, data, #action) - /** * devm_alloc_percpu - Resource-managed alloc_percpu * @dev: Device to allocate per-cpu memory for diff --git a/include/linux/device/devres.h b/include/linux/device/devres.h index 9b49f9915850..9cd1787ef28e 100644 --- a/include/linux/device/devres.h +++ b/include/linux/device/devres.h @@ -8,6 +8,7 @@ #include #include #include +#include struct device; struct device_node; @@ -126,4 +127,42 @@ void __iomem *devm_of_iomap(struct device *dev, struct device_node *node, int in #endif +/* allows to add/remove a custom action to devres stack */ +int devm_remove_action_nowarn(struct device *dev, void (*action)(void *), void *data); + +/** + * devm_remove_action() - removes previously added custom action + * @dev: Device that owns the action + * @action: Function implementing the action + * @data: Pointer to data passed to @action implementation + * + * Removes instance of @action previously added by devm_add_action(). + * Both action and data should match one of the existing entries. + */ +static inline +void devm_remove_action(struct device *dev, void (*action)(void *), void *data) +{ + WARN_ON(devm_remove_action_nowarn(dev, action, data)); +} + +void devm_release_action(struct device *dev, void (*action)(void *), void *data); + +int __devm_add_action(struct device *dev, void (*action)(void *), void *data, const char *name); +#define devm_add_action(dev, action, data) \ + __devm_add_action(dev, action, data, #action) + +static inline int __devm_add_action_or_reset(struct device *dev, void (*action)(void *), + void *data, const char *name) +{ + int ret; + + ret = __devm_add_action(dev, action, data, name); + if (ret) + action(data); + + return ret; +} +#define devm_add_action_or_reset(dev, action, data) \ + __devm_add_action_or_reset(dev, action, data, #action) + #endif /* _DEVICE_DEVRES_H_ */ -- cgit v1.2.3 From e383bb8f958444620d96386811aacf6a49757996 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 20 Feb 2025 18:20:27 +0200 Subject: devres: Add devm_is_action_added() helper In some code we would like to know if the action in device managed resources was added by devm_add_action() family of calls. Introduce a helper for that. Reviewed-by: Raag Jadav Signed-off-by: Andy Shevchenko Reviewed-by: Zijun Hu Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250220162238.2738038-3-andriy.shevchenko@linux.intel.com Signed-off-by: Bartosz Golaszewski --- include/linux/device/devres.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device/devres.h b/include/linux/device/devres.h index 9cd1787ef28e..ae696d10faff 100644 --- a/include/linux/device/devres.h +++ b/include/linux/device/devres.h @@ -165,4 +165,6 @@ static inline int __devm_add_action_or_reset(struct device *dev, void (*action)( #define devm_add_action_or_reset(dev, action, data) \ __devm_add_action_or_reset(dev, action, data, #action) +bool devm_is_action_added(struct device *dev, void (*action)(void *), void *data); + #endif /* _DEVICE_DEVRES_H_ */ -- cgit v1.2.3 From b15d97139ff14beb7c300f261e11d22d5a996941 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 3 Apr 2025 11:19:11 +0200 Subject: mtd: spinand: Use more specific naming for the reset op SPI operations have been initially described through macros implicitly implying the use of a single SPI SDR bus. Macros for supporting dual and quad I/O transfers have been added on top, generally inspired by vendor vendor naming, followed by DTR operations. Soon we might see octal and even octal DTR operations as well (including the opcode byte). Let's clarify what the macro really means by describing the expected bus topology in the reset macro name. Reviewed-by: Tudor Ambarus Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 311f145eb4e8..d1b9b630bd83 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -20,7 +20,7 @@ * Standard SPI NAND flash operations */ -#define SPINAND_RESET_OP \ +#define SPINAND_RESET_1S_0_0_OP \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xff, 1), \ SPI_MEM_OP_NO_ADDR, \ SPI_MEM_OP_NO_DUMMY, \ -- cgit v1.2.3 From 7c8896dd4a2a27c84b04dcf0990e6f6b118cb6b2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 18 Apr 2025 16:01:24 +0800 Subject: iommu: Remove IOMMU_DEV_FEAT_SVA None of the drivers implement anything here anymore, remove the dead code. Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Tested-by: Zhangfei Gao Link: https://lore.kernel.org/r/20250418080130.1844424-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 062818b51822..f39af28acaeb 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -318,18 +318,11 @@ struct iommu_iort_rmr_data { /** * enum iommu_dev_features - Per device IOMMU features - * @IOMMU_DEV_FEAT_SVA: Shared Virtual Addresses - * @IOMMU_DEV_FEAT_IOPF: I/O Page Faults such as PRI or Stall. Generally - * enabling %IOMMU_DEV_FEAT_SVA requires - * %IOMMU_DEV_FEAT_IOPF, but some devices manage I/O Page - * Faults themselves instead of relying on the IOMMU. When - * supported, this feature must be enabled before and - * disabled after %IOMMU_DEV_FEAT_SVA. + * @IOMMU_DEV_FEAT_IOPF: I/O Page Faults such as PRI or Stall. * * Device drivers enable a feature using iommu_dev_enable_feature(). */ enum iommu_dev_features { - IOMMU_DEV_FEAT_SVA, IOMMU_DEV_FEAT_IOPF, }; -- cgit v1.2.3 From f984fb09e60e0e2db5ad5be9d4cfcefb2008fda1 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 18 Apr 2025 16:01:30 +0800 Subject: iommu: Remove iommu_dev_enable/disable_feature() No external drivers use these interfaces anymore. Furthermore, no existing iommu drivers implement anything in the callbacks. Remove them to avoid dead code. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Zhangfei Gao Reviewed-by: Nicolin Chen Link: https://lore.kernel.org/r/20250418080130.1844424-9-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f39af28acaeb..8d9119b000fe 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -316,16 +316,6 @@ struct iommu_iort_rmr_data { u32 num_sids; }; -/** - * enum iommu_dev_features - Per device IOMMU features - * @IOMMU_DEV_FEAT_IOPF: I/O Page Faults such as PRI or Stall. - * - * Device drivers enable a feature using iommu_dev_enable_feature(). - */ -enum iommu_dev_features { - IOMMU_DEV_FEAT_IOPF, -}; - #define IOMMU_NO_PASID (0U) /* Reserved for DMA w/o PASID */ #define IOMMU_FIRST_GLOBAL_PASID (1U) /*starting range for allocation */ #define IOMMU_PASID_INVALID (-1U) @@ -657,9 +647,6 @@ struct iommu_ops { bool (*is_attach_deferred)(struct device *dev); /* Per device IOMMU features */ - int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f); - int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); - void (*page_response)(struct device *dev, struct iopf_fault *evt, struct iommu_page_response *msg); @@ -1128,9 +1115,6 @@ void dev_iommu_priv_set(struct device *dev, void *priv); extern struct mutex iommu_probe_device_lock; int iommu_probe_device(struct device *dev); -int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f); -int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f); - int iommu_device_use_default_domain(struct device *dev); void iommu_device_unuse_default_domain(struct device *dev); @@ -1415,18 +1399,6 @@ static inline int iommu_fwspec_add_ids(struct device *dev, u32 *ids, return -ENODEV; } -static inline int -iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat) -{ - return -ENODEV; -} - -static inline int -iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) -{ - return -ENODEV; -} - static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) { return NULL; -- cgit v1.2.3 From 0d609a1450fab636c825c66344ab0ecfc1d3a98c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 8 Apr 2025 13:35:48 -0300 Subject: iommu: Add domain_alloc_identity() virtio-iommu has a mode where the IDENTITY domain is actually a paging domain with an identity mapping covering some of the system address space manually created. To support this add a new domain_alloc_identity() op that accepts the struct device so that virtio can allocate and fully finalize a paging domain to return. Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/2-v4-ff5fb6b03bd1+288-iommu_virtio_domains_jgg@nvidia.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 8d9119b000fe..7b636b92ac7c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -567,6 +567,9 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * @domain_alloc: allocate and return an iommu domain if success. Otherwise * NULL is returned. The domain is not fully initialized until * the caller iommu_domain_alloc() returns. + * @domain_alloc_identity: allocate an IDENTITY domain. Drivers should prefer to + * use identity_domain instead. This should only be used + * if dynamic logic is necessary. * @domain_alloc_paging_flags: Allocate an iommu domain corresponding to the * input parameters as defined in * include/uapi/linux/iommufd.h. The @user_data can be @@ -625,6 +628,7 @@ struct iommu_ops { /* Domain allocation and freeing by the iommu driver */ struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); + struct iommu_domain *(*domain_alloc_identity)(struct device *dev); struct iommu_domain *(*domain_alloc_paging_flags)( struct device *dev, u32 flags, const struct iommu_user_data *user_data); -- cgit v1.2.3 From 21c03574df19f0d77cb2e4d28bc02c79b21e656a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 8 Apr 2025 13:35:51 -0300 Subject: iommu: Hide ops.domain_alloc behind CONFIG_FSL_PAMU fsl_pamu is the last user of domain_alloc(), and it is using it to create something weird that doesn't really fit into the iommu subsystem architecture. It is a not a paging domain since it doesn't have any map/unmap ops. It may be some special kind of identity domain. For now just leave it as is. Wrap it's definition in CONFIG_FSL_PAMU to discourage any new drivers from attempting to use it. Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/5-v4-ff5fb6b03bd1+288-iommu_virtio_domains_jgg@nvidia.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7b636b92ac7c..5b64d0242e66 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -564,9 +564,7 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * op is allocated in the iommu driver and freed by the caller after * use. The information type is one of enum iommu_hw_info_type defined * in include/uapi/linux/iommufd.h. - * @domain_alloc: allocate and return an iommu domain if success. Otherwise - * NULL is returned. The domain is not fully initialized until - * the caller iommu_domain_alloc() returns. + * @domain_alloc: Do not use in new drivers * @domain_alloc_identity: allocate an IDENTITY domain. Drivers should prefer to * use identity_domain instead. This should only be used * if dynamic logic is necessary. @@ -627,7 +625,9 @@ struct iommu_ops { void *(*hw_info)(struct device *dev, u32 *length, u32 *type); /* Domain allocation and freeing by the iommu driver */ +#if IS_ENABLED(CONFIG_FSL_PAMU) struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); +#endif struct iommu_domain *(*domain_alloc_identity)(struct device *dev); struct iommu_domain *(*domain_alloc_paging_flags)( struct device *dev, u32 flags, -- cgit v1.2.3 From da33e87bd2bfc63531cf7448a3cd7a3d42182f08 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 24 Apr 2025 18:41:28 +0100 Subject: iommu: Handle yet another race around registration Next up on our list of race windows to close is another one during iommu_device_register() - it's now OK again for multiple instances to run their bus_iommu_probe() in parallel, but an iommu_probe_device() can still also race against a running bus_iommu_probe(). As Johan has managed to prove, this has now become a lot more visible on DT platforms wth driver_async_probe where a client driver is attempting to probe in parallel with its IOMMU driver - although commit b46064a18810 ("iommu: Handle race with default domain setup") resolves this from the client driver's point of view, this isn't before of_iommu_configure() has had the chance to attempt to "replay" a probe that the bus walk hasn't even tried yet, and so still cause the out-of-order group allocation behaviour that we're trying to clean up (and now warning about). The most reliable thing to do here is to explicitly keep track of the "iommu_device_register() is still running" state, so we can then special-case the ops lookup for the replay path (based on dev->iommu again) to let that think it's still waiting for the IOMMU driver to appear at all. This still leaves the longstanding theoretical case of iommu_bus_notifier() being triggered during bus_iommu_probe(), but it's not so simple to defer a notifier, and nobody's ever reported that being a visible issue, so let's quietly kick that can down the road for now... Reported-by: Johan Hovold Fixes: bcb81ac6ae3c ("iommu: Get DT/ACPI parsing into the proper probe path") Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/88d54c1b48fed8279aa47d30f3d75173685bb26a.1745516488.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5b64d0242e66..bc5363e6b43e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -746,6 +746,7 @@ struct iommu_domain_ops { * @dev: struct device for sysfs handling * @singleton_group: Used internally for drivers that have only one group * @max_pasids: number of supported PASIDs + * @ready: set once iommu_device_register() has completed successfully */ struct iommu_device { struct list_head list; @@ -754,6 +755,7 @@ struct iommu_device { struct device *dev; struct iommu_group *singleton_group; u32 max_pasids; + bool ready; }; /** -- cgit v1.2.3 From 19da081a28c95fe9b03ce952a2bf4a6f6bf5112c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 23 Apr 2025 17:22:28 +0800 Subject: crypto: api - Add crypto_request_clone and fb Add a helper to clone crypto requests and eliminate code duplication. Use kmemdup in the helper. Also add an fb field to crypto_tfm. This also happens to fix the existing implementations which were buggy. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202504230118.1CxUaUoX-lkp@intel.com/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202504230004.c7mrY0C6-lkp@intel.com/ Signed-off-by: Herbert Xu --- include/linux/crypto.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index f691ce01745e..fe75320ff9a3 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include @@ -411,9 +411,11 @@ struct crypto_tfm { u32 crt_flags; int node; - + + struct crypto_tfm *fb; + void (*exit)(struct crypto_tfm *tfm); - + struct crypto_alg *__crt_alg; void *__crt_ctx[] CRYPTO_MINALIGN_ATTR; @@ -509,5 +511,8 @@ static inline void crypto_request_set_tfm(struct crypto_async_request *req, req->flags &= ~CRYPTO_TFM_REQ_ON_STACK; } +struct crypto_async_request *crypto_request_clone( + struct crypto_async_request *req, size_t total, gfp_t gfp); + #endif /* _LINUX_CRYPTO_H */ -- cgit v1.2.3 From b75fa20c127eb736b0ac9b30be051f526a2316a9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 25 Apr 2025 11:05:29 +0800 Subject: crypto: api - Add crypto_stack_request_init and initialise flags fully Add a helper to initialise crypto stack requests and use it for ahash and acomp. Make sure that the flags field is initialised fully in the helper to silence false-positive warnings from the compiler. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202504250751.mdy28Ibr-lkp@intel.com/ Signed-off-by: Herbert Xu --- include/linux/crypto.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index fe75320ff9a3..b8d875b11193 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -514,5 +514,13 @@ static inline void crypto_request_set_tfm(struct crypto_async_request *req, struct crypto_async_request *crypto_request_clone( struct crypto_async_request *req, size_t total, gfp_t gfp); +static inline void crypto_stack_request_init(struct crypto_async_request *req, + struct crypto_tfm *tfm) +{ + req->flags = 0; + crypto_request_set_tfm(req, tfm); + req->flags |= CRYPTO_TFM_REQ_ON_STACK; +} + #endif /* _LINUX_CRYPTO_H */ -- cgit v1.2.3 From 1313057c369bb4da11bb1f8dffb570ac7b44a4d4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 31 Mar 2025 21:11:11 +0100 Subject: highmem: Add memcpy_folio() The folio equivalent of memcpy_page(). It should correctly and efficiently manage large folios: - If one, neither or both is highmem - If (either or both) offset+len crosses a page boundary - If the two offsets are congruent or not Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/highmem.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 5c6bea81a90e..fd72f66b872a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -404,6 +404,33 @@ static inline void memcpy_page(struct page *dst_page, size_t dst_off, kunmap_local(dst); } +static inline void memcpy_folio(struct folio *dst_folio, size_t dst_off, + struct folio *src_folio, size_t src_off, size_t len) +{ + VM_BUG_ON(dst_off + len > folio_size(dst_folio)); + VM_BUG_ON(src_off + len > folio_size(src_folio)); + + do { + char *dst = kmap_local_folio(dst_folio, dst_off); + const char *src = kmap_local_folio(src_folio, src_off); + size_t chunk = len; + + if (folio_test_highmem(dst_folio) && + chunk > PAGE_SIZE - offset_in_page(dst_off)) + chunk = PAGE_SIZE - offset_in_page(dst_off); + if (folio_test_highmem(src_folio) && + chunk > PAGE_SIZE - offset_in_page(src_off)) + chunk = PAGE_SIZE - offset_in_page(src_off); + memcpy(dst, src, chunk); + kunmap_local(src); + kunmap_local(dst); + + dst_off += chunk; + src_off += chunk; + len -= chunk; + } while (len > 0); +} + static inline void memset_page(struct page *page, size_t offset, int val, size_t len) { -- cgit v1.2.3 From a510c186abfc870e5c74cf953b646c8a2c07bee1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Apr 2025 10:20:20 -0700 Subject: compiler_types: Identify compiler versions for __builtin_dynamic_object_size Clarify when __builtin_dynamic_object_size() is available. All our supported Clang versions support it. GCC 12 and later support it. Link to documentation for both. Acked-by: Miguel Ojeda Link: https://lore.kernel.org/r/20250416172016.work.154-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/compiler_types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 501cffddc2f4..20881cc761fa 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -449,6 +449,11 @@ struct ftrace_likely_data { /* * When the size of an allocated object is needed, use the best available * mechanism to find it. (For cases where sizeof() cannot be used.) + * + * Optional: only supported since gcc >= 12 + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Object-Size-Checking.html + * clang: https://clang.llvm.org/docs/LanguageExtensions.html#evaluating-object-size */ #if __has_builtin(__builtin_dynamic_object_size) #define __struct_size(p) __builtin_dynamic_object_size(p, 0) -- cgit v1.2.3 From 9a93048476e7cbdde00cdeebe66b6504995eac92 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Apr 2025 10:29:15 -0700 Subject: overflow: Clarify expectations for getting DEFINE_FLEX variable sizes Mention the use of __member_size() for DEFINE_FLEX variables as a hint for getting at the compile-time size of the resulting flexible array member. Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20250416172911.work.854-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/overflow.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 0c7e3dcfe867..6ee67c20b575 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -419,6 +419,7 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) * Define a zeroed, on-stack, instance of @type structure with a trailing * flexible array member. * Use __struct_size(@name) to get compile-time size of it afterwards. + * Use __member_size(@name->member) to get compile-time size of @name members. */ #define DEFINE_RAW_FLEX(type, name, member, count) \ _DEFINE_FLEX(type, name, member, count, = {}) @@ -436,6 +437,7 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) * Define a zeroed, on-stack, instance of @TYPE structure with a trailing * flexible array member. * Use __struct_size(@NAME) to get compile-time size of it afterwards. + * Use __member_size(@NAME->member) to get compile-time size of @NAME members. */ #define DEFINE_FLEX(TYPE, NAME, MEMBER, COUNTER, COUNT) \ _DEFINE_FLEX(TYPE, NAME, MEMBER, COUNT, = { .obj.COUNTER = COUNT, }) -- cgit v1.2.3 From 655862865c97ff55e4f3f2aaa7708f42f0ea3bd8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 15 Apr 2025 16:14:24 -0700 Subject: mod_devicetable: Enlarge the maximum platform_device_id name length The 20 byte length of struct platform_device_id::name is not long enough for many devices (especially regulators), where the string initialization is getting truncated and missing the trailing NUL byte. This is seen with GCC 15's -Wunterminated-string-initialization option: drivers/regulator/hi6421v530-regulator.c:189:19: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (21 chars into 20 available) [-Wunterminated-string-initialization] 189 | { .name = "hi6421v530-regulator" }, | ^~~~~~~~~~~~~~~~~~~~~~ drivers/regulator/hi6421v600-regulator.c:278:19: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (21 chars into 20 available) [-Wunterminated-string-initialization] 278 | { .name = "hi6421v600-regulator" }, | ^~~~~~~~~~~~~~~~~~~~~~ drivers/regulator/lp87565-regulator.c:233:11: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (21 chars into 20 available) [-Wunterminated-string-initialization] 233 | { "lp87565-q1-regulator", }, | ^~~~~~~~~~~~~~~~~~~~~~ sound/soc/fsl/imx-pcm-rpmsg.c:818:19: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (21 chars into 20 available) [-Wunterminated-string-initialization] 818 | { .name = "rpmsg-micfil-channel" }, | ^~~~~~~~~~~~~~~~~~~~~~ drivers/iio/light/hid-sensor-als.c:457:25: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (21 chars into 20 available) [-Wunterminated-string-initialization] 457 | .name = "HID-SENSOR-LISS-0041", | ^~~~~~~~~~~~~~~~~~~~~~ drivers/iio/light/hid-sensor-prox.c:366:25: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (21 chars into 20 available) [-Wunterminated-string-initialization] 366 | .name = "HID-SENSOR-LISS-0226", | ^~~~~~~~~~~~~~~~~~~~~~ Increase the length to 24, slightly more than is currently being used by the affected drivers. The string is used in '%s' format strings and via the module code, which appears to do its own length encoding. This size was chosen because there was already a 4 byte hole in the structure: struct platform_device_id { char name[20]; /* 0 20 */ /* XXX 4 bytes hole, try to pack */ kernel_ulong_t driver_data; /* 24 8 */ /* size: 32, cachelines: 1, members: 2 */ /* sum members: 28, holes: 1, sum holes: 4 */ /* last cacheline: 32 bytes */ }; Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250415231420.work.066-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/mod_devicetable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index bd7e60c0b72f..ebcee9328168 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -601,7 +601,7 @@ struct dmi_system_id { #define DMI_MATCH(a, b) { .slot = a, .substr = b } #define DMI_EXACT_MATCH(a, b) { .slot = a, .substr = b, .exact_match = 1 } -#define PLATFORM_NAME_SIZE 20 +#define PLATFORM_NAME_SIZE 24 #define PLATFORM_MODULE_PREFIX "platform:" struct platform_device_id { -- cgit v1.2.3 From c1f7375a246e5f810191a6c3031d2fa2b80e9f5e Mon Sep 17 00:00:00 2001 From: Jelle van der Waa Date: Mon, 14 Apr 2025 15:18:40 +0200 Subject: power: supply: support charge_types in extensions Similar to charge_behaviour, charge_types is an enum option where reading the property shows the supported values, with the active value surrounded by brackets. To be able to use it with a power_supply extension a bitmask with the supported charge_Types values has to be added to power_supply_ext. Signed-off-by: Jelle van der Waa Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20250414131840.382756-2-jvanderwaa@redhat.com Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 888824592953..c4cb854971f5 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -288,6 +288,7 @@ struct power_supply_desc { struct power_supply_ext { const char *const name; u8 charge_behaviours; + u32 charge_types; const enum power_supply_property *properties; size_t num_properties; -- cgit v1.2.3 From e43b8bb56e537bfc8d9076793091e7679020fc9c Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Thu, 20 Mar 2025 10:29:24 -0700 Subject: entry: Inline syscall_exit_to_user_mode() Similar to commit 221a164035fd ("entry: Move syscall_enter_from_user_mode() to header file"), move syscall_exit_to_user_mode() to the header file as well. Testing was done with the byte-unixbench syscall benchmark (which calls getpid) and QEMU. On riscv I measured a 7.09246% improvement, on x86 a 2.98843% improvement, on loongarch a 6.07954% improvement, and on s390 a 11.1328% improvement. The Intel bot also reported "kernel test robot noticed a 1.9% improvement of stress-ng.seek.ops_per_sec". Signed-off-by: Charlie Jenkins Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/all/20250320-riscv_optimize_entry-v6-4-63e187e26041@rivosinc.com Link: https://lore.kernel.org/linux-riscv/202502051555.85ae6844-lkp@intel.com/ --- include/linux/entry-common.h | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index fc61d0205c97..f94f3fdf15fc 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -14,6 +14,7 @@ #include #include +#include /* * Define dummy _TIF work flags if not defined by the architecture or for @@ -366,6 +367,15 @@ static __always_inline void exit_to_user_mode(void) lockdep_hardirqs_on(CALLER_ADDR0); } +/** + * syscall_exit_work - Handle work before returning to user mode + * @regs: Pointer to current pt_regs + * @work: Current thread syscall work + * + * Do one-time syscall specific work. + */ +void syscall_exit_work(struct pt_regs *regs, unsigned long work); + /** * syscall_exit_to_user_mode_work - Handle work before returning to user mode * @regs: Pointer to currents pt_regs @@ -379,7 +389,30 @@ static __always_inline void exit_to_user_mode(void) * make the final state transitions. Interrupts must stay disabled between * return from this function and the invocation of exit_to_user_mode(). */ -void syscall_exit_to_user_mode_work(struct pt_regs *regs); +static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs) +{ + unsigned long work = READ_ONCE(current_thread_info()->syscall_work); + unsigned long nr = syscall_get_nr(current, regs); + + CT_WARN_ON(ct_state() != CT_STATE_KERNEL); + + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { + if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) + local_irq_enable(); + } + + rseq_syscall(regs); + + /* + * Do one-time syscall specific work. If these work items are + * enabled, we want to run them exactly once per syscall exit with + * interrupts enabled. + */ + if (unlikely(work & SYSCALL_WORK_EXIT)) + syscall_exit_work(regs, work); + local_irq_disable_exit_to_user(); + exit_to_user_mode_prepare(regs); +} /** * syscall_exit_to_user_mode - Handle work before returning to user mode @@ -410,7 +443,13 @@ void syscall_exit_to_user_mode_work(struct pt_regs *regs); * exit_to_user_mode(). This function is preferred unless there is a * compelling architectural reason to use the separate functions. */ -void syscall_exit_to_user_mode(struct pt_regs *regs); +static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + syscall_exit_to_user_mode_work(regs); + instrumentation_end(); + exit_to_user_mode(); +} /** * irqentry_enter_from_user_mode - Establish state before invoking the irq handler -- cgit v1.2.3 From d54e34c58aa224bdd0b9ea0f429c5a90d91db2c7 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 3 Apr 2025 11:19:12 +0200 Subject: mtd: spinand: Use more specific naming for the write enable/disable op SPI operations have been initially described through macros implicitly implying the use of a single SPI SDR bus. Macros for supporting dual and quad I/O transfers have been added on top, generally inspired by vendor naming, followed by DTR operations. Soon we might see octal and even octal DTR operations as well (including the opcode byte). Let's clarify what the macro really means by describing the expected bus topology in the write enable/disable macro names. Reviewed-by: Tudor Ambarus [Miquel: Fixed conflicts with -next by updating esmt and micron drivers] Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index d1b9b630bd83..87d8f9f2017e 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -26,7 +26,7 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_NO_DATA) -#define SPINAND_WR_EN_DIS_OP(enable) \ +#define SPINAND_WR_EN_DIS_1S_0_0_OP(enable) \ SPI_MEM_OP(SPI_MEM_OP_CMD((enable) ? 0x06 : 0x04, 1), \ SPI_MEM_OP_NO_ADDR, \ SPI_MEM_OP_NO_DUMMY, \ -- cgit v1.2.3 From 2a294fa215289aff4b7b0c0a70f24dcd621df497 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 3 Apr 2025 11:19:13 +0200 Subject: mtd: spinand: Use more specific naming for the read ID op SPI operations have been initially described through macros implicitly implying the use of a single SPI SDR bus. Macros for supporting dual and quad I/O transfers have been added on top, generally inspired by vendor naming, followed by DTR operations. Soon we might see octal and even octal DTR operations as well (including the opcode byte). Let's clarify what the macro really means by describing the expected bus topology in the read ID macro name. Reviewed-by: Tudor Ambarus Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 87d8f9f2017e..dfb5c74cad6d 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -32,7 +32,7 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_NO_DATA) -#define SPINAND_READID_OP(naddr, ndummy, buf, len) \ +#define SPINAND_READID_1S_1S_1S_OP(naddr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x9f, 1), \ SPI_MEM_OP_ADDR(naddr, 0, 1), \ SPI_MEM_OP_DUMMY(ndummy, 1), \ -- cgit v1.2.3 From 429330cd1cfe860133d1fbdd49e9e081524333cf Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 3 Apr 2025 11:19:14 +0200 Subject: mtd: spinand: Use more specific naming for the get/set feature ops SPI operations have been initially described through macros implicitly implying the use of a single SPI SDR bus. Macros for supporting dual and quad I/O transfers have been added on top, generally inspired by vendor naming, followed by DTR operations. Soon we might see octal and even octal DTR operations as well (including the opcode byte). Let's clarify what the macro really mean by describing the expected bus topology in the get/set feature macro names. Reviewed-by: Tudor Ambarus [Miquel: Fixed conflicts with -next by updating macronix driver] Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index dfb5c74cad6d..8ea40d838dc2 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -38,13 +38,13 @@ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 1)) -#define SPINAND_SET_FEATURE_OP(reg, valptr) \ +#define SPINAND_SET_FEATURE_1S_1S_1S_OP(reg, valptr) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x1f, 1), \ SPI_MEM_OP_ADDR(1, reg, 1), \ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_DATA_OUT(1, valptr, 1)) -#define SPINAND_GET_FEATURE_OP(reg, valptr) \ +#define SPINAND_GET_FEATURE_1S_1S_1S_OP(reg, valptr) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x0f, 1), \ SPI_MEM_OP_ADDR(1, reg, 1), \ SPI_MEM_OP_NO_DUMMY, \ -- cgit v1.2.3 From 7e8533b273ee9e7243cc57afec835c20135ebbb2 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 3 Apr 2025 11:19:15 +0200 Subject: mtd: spinand: Use more specific naming for the erase op SPI operations have been initially described through macros implicitly implying the use of a single SPI SDR bus. Macros for supporting dual and quad I/O transfers have been added on top, generally inspired by vendor naming, followed by DTR operations. Soon we might see octal and even octal DTR operations as well (including the opcode byte). Let's clarify what the macro really means by describing the expected bus topology in the erase macro name. Reviewed-by: Tudor Ambarus Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 8ea40d838dc2..964f1244d6a6 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -50,7 +50,7 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_DATA_IN(1, valptr, 1)) -#define SPINAND_BLK_ERASE_OP(addr) \ +#define SPINAND_BLK_ERASE_1S_1S_0_OP(addr) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xd8, 1), \ SPI_MEM_OP_ADDR(3, addr, 1), \ SPI_MEM_OP_NO_DUMMY, \ -- cgit v1.2.3 From 7528c97c0c2ac4c6c09b5aae52382958a57122fe Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 3 Apr 2025 11:19:16 +0200 Subject: mtd: spinand: Use more specific naming for the page read op SPI operations have been initially described through macros implicitly implying the use of a single SPI SDR bus. Macros for supporting dual and quad I/O transfers have been added on top, generally inspired by vendor naming, followed by DTR operations. Soon we might see octal and even octal DTR operations as well (including the opcode byte). Let's clarify what the macro really means by describing the expected bus topology in the page read macro name. Reviewed-by: Tudor Ambarus Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 964f1244d6a6..df07bc55aa4d 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -56,7 +56,7 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_NO_DATA) -#define SPINAND_PAGE_READ_OP(addr) \ +#define SPINAND_PAGE_READ_1S_1S_0_OP(addr) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x13, 1), \ SPI_MEM_OP_ADDR(3, addr, 1), \ SPI_MEM_OP_NO_DUMMY, \ -- cgit v1.2.3 From ea2087d4e66d0b927918cc9048576aca6a0446ad Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 3 Apr 2025 11:19:17 +0200 Subject: mtd: spinand: Use more specific naming for the (single) read from cache ops SPI operations have been initially described through macros implicitly implying the use of a single SPI SDR bus. Macros for supporting dual and quad I/O transfers have been added on top, generally inspired by vendor naming, followed by DTR operations. Soon we might see octal and even octal DTR operations as well (including the opcode byte). Let's clarify what the macro really mean by describing the expected bus topology in the (single) read from cache macro names. Acked-by: Tudor Ambarus Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index df07bc55aa4d..c702e1ed8fe8 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -62,32 +62,32 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_NO_DATA) -#define SPINAND_PAGE_READ_FROM_CACHE_OP(addr, ndummy, buf, len, ...) \ +#define SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(addr, ndummy, buf, len, ...) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x03, 1), \ SPI_MEM_OP_ADDR(2, addr, 1), \ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 1), \ SPI_MEM_OP_MAX_FREQ(__VA_ARGS__ + 0)) -#define SPINAND_PAGE_READ_FROM_CACHE_FAST_OP(addr, ndummy, buf, len) \ - SPI_MEM_OP(SPI_MEM_OP_CMD(0x0b, 1), \ +#define SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(addr, ndummy, buf, len) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x0b, 1), \ SPI_MEM_OP_ADDR(2, addr, 1), \ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 1)) -#define SPINAND_PAGE_READ_FROM_CACHE_OP_3A(addr, ndummy, buf, len) \ +#define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_1S_1S_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x03, 1), \ SPI_MEM_OP_ADDR(3, addr, 1), \ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 1)) -#define SPINAND_PAGE_READ_FROM_CACHE_FAST_OP_3A(addr, ndummy, buf, len) \ +#define SPINAND_PAGE_READ_FROM_CACHE_FAST_3A_1S_1S_1S_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x0b, 1), \ SPI_MEM_OP_ADDR(3, addr, 1), \ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 1)) -#define SPINAND_PAGE_READ_FROM_CACHE_DTR_OP(addr, ndummy, buf, len, freq) \ +#define SPINAND_PAGE_READ_FROM_CACHE_1S_1D_1D_OP(addr, ndummy, buf, len, freq) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x0d, 1), \ SPI_MEM_DTR_OP_ADDR(2, addr, 1), \ SPI_MEM_DTR_OP_DUMMY(ndummy, 1), \ -- cgit v1.2.3 From 684f7105e8534f6500de389c089ba204cb1c8058 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 3 Apr 2025 11:19:18 +0200 Subject: mtd: spinand: Use more specific naming for the (dual output) read from cache ops SPI operations have been initially described through macros implicitly implying the use of a single SPI SDR bus. Macros for supporting dual and quad I/O transfers have been added on top, generally inspired by vendor naming, followed by DTR operations. Soon we might see octal and even octal DTR operations as well (including the opcode byte). Let's clarify what the macro really mean by describing the expected bus topology in the (dual output) read from cache macro names. Acked-by: Tudor Ambarus Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index c702e1ed8fe8..f84991d05d85 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -94,19 +94,19 @@ SPI_MEM_DTR_OP_DATA_IN(len, buf, 1), \ SPI_MEM_OP_MAX_FREQ(freq)) -#define SPINAND_PAGE_READ_FROM_CACHE_X2_OP(addr, ndummy, buf, len) \ +#define SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x3b, 1), \ SPI_MEM_OP_ADDR(2, addr, 1), \ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 2)) -#define SPINAND_PAGE_READ_FROM_CACHE_X2_OP_3A(addr, ndummy, buf, len) \ +#define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_1S_2S_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x3b, 1), \ SPI_MEM_OP_ADDR(3, addr, 1), \ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 2)) -#define SPINAND_PAGE_READ_FROM_CACHE_X2_DTR_OP(addr, ndummy, buf, len, freq) \ +#define SPINAND_PAGE_READ_FROM_CACHE_1S_1D_2D_OP(addr, ndummy, buf, len, freq) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x3d, 1), \ SPI_MEM_DTR_OP_ADDR(2, addr, 1), \ SPI_MEM_DTR_OP_DUMMY(ndummy, 1), \ -- cgit v1.2.3 From d9de177996d74c0cc44220003953ba2d2bece0ac Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 3 Apr 2025 11:19:19 +0200 Subject: mtd: spinand: Use more specific naming for the (dual IO) read from cache ops SPI operations have been initially described through macros implicitly implying the use of a single SPI SDR bus. Macros for supporting dual and quad I/O transfers have been added on top, generally inspired by vendor naming, followed by DTR operations. Soon we might see octal and even octal DTR operations as well (including the opcode byte). Let's clarify what the macro really mean by describing the expected bus topology in the (dual IO) read from cache macro names. While at modifying them, better reordering the macros to group them all by bus topology which now feels more intuitive. Acked-by: Tudor Ambarus Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index f84991d05d85..a8a963f57d43 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -113,6 +113,25 @@ SPI_MEM_DTR_OP_DATA_IN(len, buf, 2), \ SPI_MEM_OP_MAX_FREQ(freq)) +#define SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(addr, ndummy, buf, len) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1), \ + SPI_MEM_OP_ADDR(2, addr, 2), \ + SPI_MEM_OP_DUMMY(ndummy, 2), \ + SPI_MEM_OP_DATA_IN(len, buf, 2)) + +#define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_2S_2S_OP(addr, ndummy, buf, len) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1), \ + SPI_MEM_OP_ADDR(3, addr, 2), \ + SPI_MEM_OP_DUMMY(ndummy, 2), \ + SPI_MEM_OP_DATA_IN(len, buf, 2)) + +#define SPINAND_PAGE_READ_FROM_CACHE_1S_2D_2D_OP(addr, ndummy, buf, len, freq) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0xbd, 1), \ + SPI_MEM_DTR_OP_ADDR(2, addr, 2), \ + SPI_MEM_DTR_OP_DUMMY(ndummy, 2), \ + SPI_MEM_DTR_OP_DATA_IN(len, buf, 2), \ + SPI_MEM_OP_MAX_FREQ(freq)) + #define SPINAND_PAGE_READ_FROM_CACHE_X4_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x6b, 1), \ SPI_MEM_OP_ADDR(2, addr, 1), \ @@ -132,25 +151,6 @@ SPI_MEM_DTR_OP_DATA_IN(len, buf, 4), \ SPI_MEM_OP_MAX_FREQ(freq)) -#define SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP(addr, ndummy, buf, len) \ - SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1), \ - SPI_MEM_OP_ADDR(2, addr, 2), \ - SPI_MEM_OP_DUMMY(ndummy, 2), \ - SPI_MEM_OP_DATA_IN(len, buf, 2)) - -#define SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP_3A(addr, ndummy, buf, len) \ - SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1), \ - SPI_MEM_OP_ADDR(3, addr, 2), \ - SPI_MEM_OP_DUMMY(ndummy, 2), \ - SPI_MEM_OP_DATA_IN(len, buf, 2)) - -#define SPINAND_PAGE_READ_FROM_CACHE_DUALIO_DTR_OP(addr, ndummy, buf, len, freq) \ - SPI_MEM_OP(SPI_MEM_OP_CMD(0xbd, 1), \ - SPI_MEM_DTR_OP_ADDR(2, addr, 2), \ - SPI_MEM_DTR_OP_DUMMY(ndummy, 2), \ - SPI_MEM_DTR_OP_DATA_IN(len, buf, 2), \ - SPI_MEM_OP_MAX_FREQ(freq)) - #define SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1), \ SPI_MEM_OP_ADDR(2, addr, 4), \ -- cgit v1.2.3 From 1deae734cc1c7e976d588e7d8f46af2bb9ef5656 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 3 Apr 2025 11:19:20 +0200 Subject: mtd: spinand: Use more specific naming for the (quad output) read from cache ops SPI operations have been initially described through macros implicitly implying the use of a single SPI SDR bus. Macros for supporting dual and quad I/O transfers have been added on top, generally inspired by vendor naming, followed by DTR operations. Soon we might see octal and even octal DTR operations as well (including the opcode byte). Let's clarify what the macro really mean by describing the expected bus topology in the (quad output) read from cache macro names. Acked-by: Tudor Ambarus Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index a8a963f57d43..cbeec0f53f88 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -132,19 +132,19 @@ SPI_MEM_DTR_OP_DATA_IN(len, buf, 2), \ SPI_MEM_OP_MAX_FREQ(freq)) -#define SPINAND_PAGE_READ_FROM_CACHE_X4_OP(addr, ndummy, buf, len) \ +#define SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x6b, 1), \ SPI_MEM_OP_ADDR(2, addr, 1), \ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 4)) -#define SPINAND_PAGE_READ_FROM_CACHE_X4_OP_3A(addr, ndummy, buf, len) \ +#define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_1S_4S_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x6b, 1), \ SPI_MEM_OP_ADDR(3, addr, 1), \ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 4)) -#define SPINAND_PAGE_READ_FROM_CACHE_X4_DTR_OP(addr, ndummy, buf, len, freq) \ +#define SPINAND_PAGE_READ_FROM_CACHE_1S_1D_4D_OP(addr, ndummy, buf, len, freq) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x6d, 1), \ SPI_MEM_DTR_OP_ADDR(2, addr, 1), \ SPI_MEM_DTR_OP_DUMMY(ndummy, 1), \ -- cgit v1.2.3 From 9c6911072c6e8b128ccdb7dd00efa13c47513074 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 3 Apr 2025 11:19:21 +0200 Subject: mtd: spinand: Use more specific naming for the (quad IO) read from cache ops SPI operations have been initially described through macros implicitly implying the use of a single SPI SDR bus. Macros for supporting dual and quad I/O transfers have been added on top, generally inspired by vendor naming, followed by DTR operations. Soon we might see octal and even octal DTR operations as well (including the opcode byte). Let's clarify what the macro really mean by describing the expected bus topology in the (quad IO) read from cache macro names. Acked-by: Tudor Ambarus Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index cbeec0f53f88..70f3e69e56b8 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -151,19 +151,19 @@ SPI_MEM_DTR_OP_DATA_IN(len, buf, 4), \ SPI_MEM_OP_MAX_FREQ(freq)) -#define SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(addr, ndummy, buf, len) \ +#define SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1), \ SPI_MEM_OP_ADDR(2, addr, 4), \ SPI_MEM_OP_DUMMY(ndummy, 4), \ SPI_MEM_OP_DATA_IN(len, buf, 4)) -#define SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP_3A(addr, ndummy, buf, len) \ +#define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_4S_4S_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1), \ SPI_MEM_OP_ADDR(3, addr, 4), \ SPI_MEM_OP_DUMMY(ndummy, 4), \ SPI_MEM_OP_DATA_IN(len, buf, 4)) -#define SPINAND_PAGE_READ_FROM_CACHE_QUADIO_DTR_OP(addr, ndummy, buf, len, freq) \ +#define SPINAND_PAGE_READ_FROM_CACHE_1S_4D_4D_OP(addr, ndummy, buf, len, freq) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xed, 1), \ SPI_MEM_DTR_OP_ADDR(2, addr, 4), \ SPI_MEM_DTR_OP_DUMMY(ndummy, 4), \ -- cgit v1.2.3 From 36e461894cf31e33ebd869f3fd85c5d7c68a22bc Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 3 Apr 2025 11:19:22 +0200 Subject: mtd: spinand: Use more specific naming for the program execution op SPI operations have been initially described through macros implicitly implying the use of a single SPI SDR bus. Macros for supporting dual and quad I/O transfers have been added on top, generally inspired by vendor naming, followed by DTR operations. Soon we might see octal and even octal DTR operations as well (including the opcode byte). Let's clarify what the macro really means by describing the expected bus topology in the program execution macro name. Acked-by: Tudor Ambarus [Miquel: Fixed conflicts with -next by updating esmt and micron drivers] Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 70f3e69e56b8..d84206b9e824 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -170,7 +170,7 @@ SPI_MEM_DTR_OP_DATA_IN(len, buf, 4), \ SPI_MEM_OP_MAX_FREQ(freq)) -#define SPINAND_PROG_EXEC_OP(addr) \ +#define SPINAND_PROG_EXEC_1S_1S_0_OP(addr) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x10, 1), \ SPI_MEM_OP_ADDR(3, addr, 1), \ SPI_MEM_OP_NO_DUMMY, \ -- cgit v1.2.3 From 07cdbae7f84190983f9b07133ce5b6f2634c95cd Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 3 Apr 2025 11:19:23 +0200 Subject: mtd: spinand: Use more specific naming for the (single) program load op SPI operations have been initially described through macros implicitly implying the use of a single SPI SDR bus. Macros for supporting dual and quad I/O transfers have been added on top, generally inspired by vendor naming, followed by DTR operations. Soon we might see octal and even octal DTR operations as well (including the opcode byte). Let's clarify what the macro really means by describing the expected bus topology in the (single) program load macro name. While at modifying it, better add the missing_ OP suffix to align with all the other macros of the same kind. Acked-by: Tudor Ambarus Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index d84206b9e824..d1cc2cdbd4a1 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -176,7 +176,7 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_NO_DATA) -#define SPINAND_PROG_LOAD(reset, addr, buf, len) \ +#define SPINAND_PROG_LOAD_1S_1S_1S_OP(reset, addr, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(reset ? 0x02 : 0x84, 1), \ SPI_MEM_OP_ADDR(2, addr, 1), \ SPI_MEM_OP_NO_DUMMY, \ -- cgit v1.2.3 From ac3a4b17e03b079c00eb61456364bcdbf65f3436 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 3 Apr 2025 11:19:24 +0200 Subject: mtd: spinand: Use more specific naming for the (quad) program load op SPI operations have been initially described through macros implicitly implying the use of a single SPI SDR bus. Macros for supporting dual and quad I/O transfers have been added on top, generally inspired by vendor naming, followed by DTR operations. Soon we might see octal and even octal DTR operations as well (including the opcode byte). Let's clarify what the macro really means by describing the expected bus topology in the (quad) program load macro name. While at modifying it, better add the missing_ OP suffix to align with all the other macros of the same kind. Acked-by: Tudor Ambarus Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index d1cc2cdbd4a1..c70d17e0b308 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -182,7 +182,7 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_DATA_OUT(len, buf, 1)) -#define SPINAND_PROG_LOAD_X4(reset, addr, buf, len) \ +#define SPINAND_PROG_LOAD_1S_1S_4S_OP(reset, addr, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(reset ? 0x32 : 0x34, 1), \ SPI_MEM_OP_ADDR(2, addr, 1), \ SPI_MEM_OP_NO_DUMMY, \ -- cgit v1.2.3 From 51b252cce172cbfb21dfd5e544dcbefc649f3daa Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 3 Apr 2025 11:19:25 +0200 Subject: mtd: spinand: Define octal operations SPI NAND chips may support octal "read from cache" and "program load" transfers. List the opcodes by defining the relevant macros describing these operations. However, due to the hardware available I had, 0x82 and 0xc2 are untested and given as reference, only 0xc4 could be (successfully) tested. Controllers supporting operations mixing SDR and DTR operations might even leverage octal DTR data I/O transfers. Acked-by: Tudor Ambarus Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index c70d17e0b308..811a0f356315 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -170,6 +170,27 @@ SPI_MEM_DTR_OP_DATA_IN(len, buf, 4), \ SPI_MEM_OP_MAX_FREQ(freq)) +#define SPINAND_PAGE_READ_FROM_CACHE_1S_1S_8S_OP(addr, ndummy, buf, len, freq) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x8b, 1), \ + SPI_MEM_OP_ADDR(2, addr, 1), \ + SPI_MEM_OP_DUMMY(ndummy, 1), \ + SPI_MEM_OP_DATA_IN(len, buf, 8), \ + SPI_MEM_OP_MAX_FREQ(freq)) + +#define SPINAND_PAGE_READ_FROM_CACHE_1S_8S_8S_OP(addr, ndummy, buf, len, freq) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0xcb, 1), \ + SPI_MEM_OP_ADDR(2, addr, 8), \ + SPI_MEM_OP_DUMMY(ndummy, 8), \ + SPI_MEM_OP_DATA_IN(len, buf, 8), \ + SPI_MEM_OP_MAX_FREQ(freq)) + +#define SPINAND_PAGE_READ_FROM_CACHE_1S_1D_8D_OP(addr, ndummy, buf, len, freq) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x9d, 1), \ + SPI_MEM_DTR_OP_ADDR(2, addr, 1), \ + SPI_MEM_DTR_OP_DUMMY(ndummy, 1), \ + SPI_MEM_DTR_OP_DATA_IN(len, buf, 8), \ + SPI_MEM_OP_MAX_FREQ(freq)) + #define SPINAND_PROG_EXEC_1S_1S_0_OP(addr) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x10, 1), \ SPI_MEM_OP_ADDR(3, addr, 1), \ @@ -188,6 +209,18 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_DATA_OUT(len, buf, 4)) +#define SPINAND_PROG_LOAD_1S_1S_8S_OP(addr, buf, len) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x82, 1), \ + SPI_MEM_OP_ADDR(2, addr, 1), \ + SPI_MEM_OP_NO_DUMMY, \ + SPI_MEM_OP_DATA_OUT(len, buf, 8)) + +#define SPINAND_PROG_LOAD_1S_8S_8S_OP(reset, addr, buf, len) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(reset ? 0xc2 : 0xc4, 1), \ + SPI_MEM_OP_ADDR(2, addr, 8), \ + SPI_MEM_OP_NO_DUMMY, \ + SPI_MEM_OP_DATA_OUT(len, buf, 8)) + /** * Standard SPI NAND flash commands */ -- cgit v1.2.3 From ee000969f28bf579d3772bf7c0ae8aff86586e20 Mon Sep 17 00:00:00 2001 From: Md Sadre Alam Date: Thu, 10 Apr 2025 15:30:17 +0530 Subject: mtd: rawnand: qcom: Pass 18 bit offset from NANDc base to BAM base The BAM command descriptor provides only 18 bits to specify the BAM register offset. Additionally, in the BAM command descriptor, the BAM register offset is supposed to be specified as "(NANDc base - BAM base) + reg_off". Since, the BAM controller expecting the value in the form of "NANDc base - BAM base", so that added a new field 'bam_offset' in the NAND properties structure and use it while preparing the command descriptor. Previously, the driver was specifying the NANDc base address in the BAM command descriptor. Cc: stable@vger.kernel.org Fixes: 8d6b6d7e135e ("mtd: nand: qcom: support for command descriptor formation") Tested-by: Lakshmi Sowjanya D Signed-off-by: Md Sadre Alam Acked-by: Mark Brown Tested-by: Gabor Juhos # on IPQ9574 Reviewed-by: Manivannan Sadhasivam Signed-off-by: Miquel Raynal --- include/linux/mtd/nand-qpic-common.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand-qpic-common.h b/include/linux/mtd/nand-qpic-common.h index cd7172e6c1bb..e8462deda6db 100644 --- a/include/linux/mtd/nand-qpic-common.h +++ b/include/linux/mtd/nand-qpic-common.h @@ -199,9 +199,6 @@ */ #define dev_cmd_reg_addr(nandc, reg) ((nandc)->props->dev_cmd_reg_start + (reg)) -/* Returns the NAND register physical address */ -#define nandc_reg_phys(chip, offset) ((chip)->base_phys + (offset)) - /* Returns the dma address for reg read buffer */ #define reg_buf_dma_addr(chip, vaddr) \ ((chip)->reg_read_dma + \ @@ -454,6 +451,7 @@ struct qcom_nand_controller { struct qcom_nandc_props { u32 ecc_modes; u32 dev_cmd_reg_start; + u32 bam_offset; bool supports_bam; bool nandc_part_of_qpic; bool qpic_version2; -- cgit v1.2.3 From 73db799bf5efc5a04654bb3ff6c9bf63a0dfa473 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bence=20Cs=C3=B3k=C3=A1s?= Date: Thu, 27 Mar 2025 20:59:26 +0100 Subject: PM: runtime: Add new devm functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `devm_pm_runtime_set_active_enabled()` and `devm_pm_runtime_get_noresume()` for simplifying common cases in drivers. Signed-off-by: Bence Csókás Link: https://patch.msgid.link/20250327195928.680771-3-csokas.bence@prolan.hu Signed-off-by: Rafael J. Wysocki --- include/linux/pm_runtime.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 7fb5a459847e..756b842dcd30 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -96,7 +96,9 @@ extern void pm_runtime_new_link(struct device *dev); extern void pm_runtime_drop_link(struct device_link *link); extern void pm_runtime_release_supplier(struct device_link *link); +int devm_pm_runtime_set_active_enabled(struct device *dev); extern int devm_pm_runtime_enable(struct device *dev); +int devm_pm_runtime_get_noresume(struct device *dev); /** * pm_suspend_ignore_children - Set runtime PM behavior regarding children. @@ -294,7 +296,9 @@ static inline bool pm_runtime_blocked(struct device *dev) { return true; } static inline void pm_runtime_allow(struct device *dev) {} static inline void pm_runtime_forbid(struct device *dev) {} +static inline int devm_pm_runtime_set_active_enabled(struct device *dev) { return 0; } static inline int devm_pm_runtime_enable(struct device *dev) { return 0; } +static inline int devm_pm_runtime_get_noresume(struct device *dev) { return 0; } static inline void pm_suspend_ignore_children(struct device *dev, bool enable) {} static inline void pm_runtime_get_noresume(struct device *dev) {} -- cgit v1.2.3 From 9f52aecc952ddf307571517d5c91136c8c4e87c9 Mon Sep 17 00:00:00 2001 From: Junhao He Date: Wed, 18 Sep 2024 11:53:27 +0800 Subject: coresight: Fixes device's owner field for registered using coresight_init_driver() The coresight_init_driver() of the coresight-core module is called from the sub coresgiht device (such as tmc/stm/funnle/...) module. It calls amba_driver_register() and Platform_driver_register(), which are macro functions that use the coresight-core's module to initialize the caller's owner field. Therefore, when the sub coresight device calls coresight_init_driver(), an incorrect THIS_MODULE value is captured. The sub coesgiht modules can be removed while their callbacks are running, resulting in a general protection failure. Add module parameter to coresight_init_driver() so can be called with the module of the callback. Fixes: 075b7cd7ad7d ("coresight: Add helpers registering/removing both AMBA and platform drivers") Signed-off-by: Junhao He Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20240918035327.9710-1-hejunhao3@huawei.com --- include/linux/coresight.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index d79a242b271d..cfcf6e4707ed 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -723,7 +723,7 @@ coresight_find_output_type(struct coresight_platform_data *pdata, union coresight_dev_subtype subtype); int coresight_init_driver(const char *drv, struct amba_driver *amba_drv, - struct platform_driver *pdev_drv); + struct platform_driver *pdev_drv, struct module *owner); void coresight_remove_driver(struct amba_driver *amba_drv, struct platform_driver *pdev_drv); -- cgit v1.2.3 From 0091d9241ea24c5275be4a3e5a032862fd9de9ec Mon Sep 17 00:00:00 2001 From: Steven Chen Date: Mon, 21 Apr 2025 15:25:09 -0700 Subject: kexec: define functions to map and unmap segments Implement kimage_map_segment() to enable IMA to map the measurement log list to the kimage structure during the kexec 'load' stage. This function gathers the source pages within the specified address range, and maps them to a contiguous virtual address range. This is a preparation for later usage. Implement kimage_unmap_segment() for unmapping segments using vunmap(). Cc: Eric Biederman Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Young Co-developed-by: Tushar Sugandhi Signed-off-by: Tushar Sugandhi Signed-off-by: Steven Chen Acked-by: Baoquan He Tested-by: Stefan Berger # ppc64/kvm Signed-off-by: Mimi Zohar --- include/linux/kexec.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index c8971861521a..805743208d19 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -474,13 +474,19 @@ extern bool kexec_file_dbg_print; #define kexec_dprintk(fmt, arg...) \ do { if (kexec_file_dbg_print) pr_info(fmt, ##arg); } while (0) +extern void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size); +extern void kimage_unmap_segment(void *buffer); #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; struct task_struct; +struct kimage; static inline void __crash_kexec(struct pt_regs *regs) { } static inline void crash_kexec(struct pt_regs *regs) { } static inline int kexec_should_crash(struct task_struct *p) { return 0; } static inline int kexec_crash_loaded(void) { return 0; } +static inline void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size) +{ return NULL; } +static inline void kimage_unmap_segment(void *buffer) { } #define kexec_in_progress false #endif /* CONFIG_KEXEC_CORE */ -- cgit v1.2.3 From 9ee8888a80fe2bd20ce929ffbc1dedd57607a778 Mon Sep 17 00:00:00 2001 From: Steven Chen Date: Mon, 21 Apr 2025 15:25:10 -0700 Subject: ima: kexec: skip IMA segment validation after kexec soft reboot Currently, the function kexec_calculate_store_digests() calculates and stores the digest of the segment during the kexec_file_load syscall, where the IMA segment is also allocated. Later, the IMA segment will be updated with the measurement log at the kexec execute stage when a kexec reboot is initiated. Therefore, the digests should be updated for the IMA segment in the normal case. The problem is that the content of memory segments carried over to the new kernel during the kexec systemcall can be changed at kexec 'execute' stage, but the size and the location of the memory segments cannot be changed at kexec 'execute' stage. To address this, skip the calculation and storage of the digest for the IMA segment in kexec_calculate_store_digests() so that it is not added to the purgatory_sha_regions. With this change, the IMA segment is not included in the digest calculation, storage, and verification. Cc: Eric Biederman Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Young Co-developed-by: Tushar Sugandhi Signed-off-by: Tushar Sugandhi Signed-off-by: Steven Chen Reviewed-by: Stefan Berger Acked-by: Baoquan He Tested-by: Stefan Berger # ppc64/kvm [zohar@linux.ibm.com: Fixed Signed-off-by tag to match author's email ] Signed-off-by: Mimi Zohar --- include/linux/kexec.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 805743208d19..53ef1b6c8712 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -369,6 +369,9 @@ struct kimage { phys_addr_t ima_buffer_addr; size_t ima_buffer_size; + + unsigned long ima_segment_index; + bool is_ima_segment_index_set; #endif /* Core ELF header buffer */ -- cgit v1.2.3 From f18e502db673c75f762d47101dafcf58f30e2733 Mon Sep 17 00:00:00 2001 From: Steven Chen Date: Mon, 21 Apr 2025 15:25:11 -0700 Subject: ima: kexec: define functions to copy IMA log at soft boot The IMA log is currently copied to the new kernel during kexec 'load' using ima_dump_measurement_list(). However, the log copied at kexec 'load' may result in loss of IMA measurements that only occurred after kexec "load'. Setup the needed infrastructure to move the IMA log copy from kexec 'load' to 'execute'. Define a new IMA hook ima_update_kexec_buffer() as a stub function. It will be used to call ima_dump_measurement_list() during kexec 'execute'. Implement ima_kexec_post_load() function to be invoked after the new Kernel image has been loaded for kexec. ima_kexec_post_load() maps the IMA buffer to a segment in the newly loaded Kernel. It also registers the reboot notifier_block to trigger ima_update_kexec_buffer() at kexec 'execute'. Set the priority of register_reboot_notifier to INT_MIN to ensure that the IMA log copy operation will happen at the end of the operation chain, so that all the IMA measurement records extended into the TPM are copied Co-developed-by: Tushar Sugandhi Signed-off-by: Tushar Sugandhi Cc: Eric Biederman Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Young Signed-off-by: Steven Chen Reviewed-by: Stefan Berger Acked-by: Baoquan He Tested-by: Stefan Berger # ppc64/kvm Signed-off-by: Mimi Zohar --- include/linux/ima.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index 0bae61a15b60..8e29cb4e6a01 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -32,6 +32,9 @@ static inline void ima_appraise_parse_cmdline(void) {} #ifdef CONFIG_IMA_KEXEC extern void ima_add_kexec_buffer(struct kimage *image); +extern void ima_kexec_post_load(struct kimage *image); +#else +static inline void ima_kexec_post_load(struct kimage *image) {} #endif #else -- cgit v1.2.3 From 74a2bd0bfb0675cb8f276f2fe3ede0c7f9e78fa3 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 12 Mar 2025 22:19:49 -0400 Subject: nodemask: drop nodes_shift nodes_shift_{left,right} are not used. Drop them. Signed-off-by: Yury Norov [NVIDIA] --- include/linux/nodemask.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index f0ac0633366b..0aa6b63aa747 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -39,9 +39,6 @@ * int nodes_full(mask) Is mask full (all bits sets)? * int nodes_weight(mask) Hamming weight - number of set bits * - * void nodes_shift_right(dst, src, n) Shift right - * void nodes_shift_left(dst, src, n) Shift left - * * unsigned int first_node(mask) Number lowest set bit, or MAX_NUMNODES * unsigend int next_node(node, mask) Next node past 'node', or MAX_NUMNODES * unsigned int next_node_in(node, mask) Next node past 'node', or wrap to first, @@ -247,22 +244,6 @@ static __always_inline int __nodes_weight(const nodemask_t *srcp, unsigned int n return bitmap_weight(srcp->bits, nbits); } -#define nodes_shift_right(dst, src, n) \ - __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) -static __always_inline void __nodes_shift_right(nodemask_t *dstp, - const nodemask_t *srcp, int n, int nbits) -{ - bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); -} - -#define nodes_shift_left(dst, src, n) \ - __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) -static __always_inline void __nodes_shift_left(nodemask_t *dstp, - const nodemask_t *srcp, int n, int nbits) -{ - bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); -} - /* FIXME: better would be to fix all architectures to never return > MAX_NUMNODES, then the silly min_ts could be dropped. */ -- cgit v1.2.3 From 4923c2c5b66fe8dea1df5e16d61f15c1dbea5ba1 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 12 Mar 2025 22:19:50 -0400 Subject: cpumask: add non-atomic __assign_cpu() Similarly to atomic, add a non-atomic version. Signed-off-by: Yury Norov [NVIDIA] --- include/linux/cpumask.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index f9a868384083..ba1e232e0200 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1074,6 +1074,9 @@ void init_cpu_possible(const struct cpumask *src); #define assign_cpu(cpu, mask, val) \ assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) +#define __assign_cpu(cpu, mask, val) \ + __assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) + #define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible)) #define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_enabled_mask, (enabled)) #define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present)) -- cgit v1.2.3 From 791a9b25ce2e6ecbe404ee32eed8a96a17e52896 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 12 Mar 2025 22:19:52 -0400 Subject: cpumask: drop cpumask_assign_cpu() Commit decde1fa209323c7 ("cpumask: Add assign cpu") was merged bypassing cpumasks reviewers. It adds atomic and non-atomic cpumask_assign_cpu() helpers. In the same merge window, commit 5c563ee90a22d3 ("cpumask: introduce assign_cpu() macro") added the same functionality. So now we have it duplicated. __cpumask_assign_cpu() has never been used since introducing, and because this series reworks the only user of cpumask_assign_cpu(), both functions become a dead code. Signed-off-by: Yury Norov [NVIDIA] --- include/linux/cpumask.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index ba1e232e0200..beff4d26e605 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -558,22 +558,6 @@ static __always_inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp) __clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } -/** - * cpumask_assign_cpu - assign a cpu in a cpumask - * @cpu: cpu number (< nr_cpu_ids) - * @dstp: the cpumask pointer - * @bool: the value to assign - */ -static __always_inline void cpumask_assign_cpu(int cpu, struct cpumask *dstp, bool value) -{ - assign_bit(cpumask_check(cpu), cpumask_bits(dstp), value); -} - -static __always_inline void __cpumask_assign_cpu(int cpu, struct cpumask *dstp, bool value) -{ - __assign_bit(cpumask_check(cpu), cpumask_bits(dstp), value); -} - /** * cpumask_test_cpu - test for a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) -- cgit v1.2.3 From 31299a5e0211241171b2222c5633aad4763bf700 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 26 Mar 2025 00:59:56 +0900 Subject: bits: add comments and newlines to #if, #else and #endif directives This is a preparation for the upcoming GENMASK_U*() and BIT_U*() changes. After introducing those new macros, there will be a lot of scrolling between the #if, #else and #endif. Add a comment to the #else and #endif preprocessor macros to help keep track of which context we are in. Also, add new lines to better visually separate the non-asm and asm sections. Signed-off-by: Vincent Mailhol Reviewed-by: Andy Shevchenko Signed-off-by: Yury Norov --- include/linux/bits.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bits.h b/include/linux/bits.h index 14fd0ca9a6cd..e1e517769140 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -19,16 +19,21 @@ * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. */ #if !defined(__ASSEMBLY__) + #include #include + #define GENMASK_INPUT_CHECK(h, l) BUILD_BUG_ON_ZERO(const_true((l) > (h))) -#else + +#else /* defined(__ASSEMBLY__) */ + /* * BUILD_BUG_ON_ZERO is not available in h files included from asm files, * disable the input check if that is the case. */ #define GENMASK_INPUT_CHECK(h, l) 0 -#endif + +#endif /* !defined(__ASSEMBLY__) */ #define GENMASK(h, l) \ (GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l)) -- cgit v1.2.3 From 19408200c094858d952a90bf4977733dc89a4df5 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 26 Mar 2025 00:59:57 +0900 Subject: bits: introduce fixed-type GENMASK_U*() Add GENMASK_TYPE() which generalizes __GENMASK() to support different types, and implement fixed-types versions of GENMASK() based on it. The fixed-type version allows more strict checks to the min/max values accepted, which is useful for defining registers like implemented by i915 and xe drivers with their REG_GENMASK*() macros. The strict checks rely on shift-count-overflow compiler check to fail the build if a number outside of the range allowed is passed. Example: #define FOO_MASK GENMASK_U32(33, 4) will generate a warning like: include/linux/bits.h:51:27: error: right shift count >= width of type [-Werror=shift-count-overflow] 51 | type_max(t) >> (BITS_PER_TYPE(t) - 1 - (h))))) | ^~ The result is casted to the corresponding fixed width type. For example, GENMASK_U8() returns an u8. Note that because of the C promotion rules, GENMASK_U8() and GENMASK_U16() will immediately be promoted to int if used in an expression. Regardless, the main goal is not to get the correct type, but rather to enforce more checks at compile time. While GENMASK_TYPE() is crafted to cover all variants, including the already existing GENMASK(), GENMASK_ULL() and GENMASK_U128(), for the moment, only use it for the newly introduced GENMASK_U*(). The consolidation will be done in a separate change. Co-developed-by: Yury Norov Signed-off-by: Lucas De Marchi Acked-by: Jani Nikula Signed-off-by: Vincent Mailhol Reviewed-by: Andy Shevchenko Signed-off-by: Yury Norov --- include/linux/bitops.h | 1 - include/linux/bits.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index c1cb53cf2f0f..9be2d50da09a 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -8,7 +8,6 @@ #include -#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) #define BITS_TO_LONGS(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) #define BITS_TO_U64(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u64)) #define BITS_TO_U32(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) diff --git a/include/linux/bits.h b/include/linux/bits.h index e1e517769140..9718c5ae5fc3 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -12,6 +12,7 @@ #define BIT_ULL_MASK(nr) (ULL(1) << ((nr) % BITS_PER_LONG_LONG)) #define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG) #define BITS_PER_BYTE 8 +#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) /* * Create a contiguous bitmask starting at bit position @l and ending at @@ -20,11 +21,40 @@ */ #if !defined(__ASSEMBLY__) +/* + * Missing asm support + * + * GENMASK_U*() depend on BITS_PER_TYPE() which relies on sizeof(), + * something not available in asm. Nevertheless, fixed width integers is a C + * concept. Assembly code can rely on the long and long long versions instead. + */ + #include #include +#include #define GENMASK_INPUT_CHECK(h, l) BUILD_BUG_ON_ZERO(const_true((l) > (h))) +/* + * Generate a mask for the specified type @t. Additional checks are made to + * guarantee the value returned fits in that type, relying on + * -Wshift-count-overflow compiler check to detect incompatible arguments. + * For example, all these create build errors or warnings: + * + * - GENMASK(15, 20): wrong argument order + * - GENMASK(72, 15): doesn't fit unsigned long + * - GENMASK_U32(33, 15): doesn't fit in a u32 + */ +#define GENMASK_TYPE(t, h, l) \ + ((t)(GENMASK_INPUT_CHECK(h, l) + \ + (type_max(t) << (l) & \ + type_max(t) >> (BITS_PER_TYPE(t) - 1 - (h))))) + +#define GENMASK_U8(h, l) GENMASK_TYPE(u8, h, l) +#define GENMASK_U16(h, l) GENMASK_TYPE(u16, h, l) +#define GENMASK_U32(h, l) GENMASK_TYPE(u32, h, l) +#define GENMASK_U64(h, l) GENMASK_TYPE(u64, h, l) + #else /* defined(__ASSEMBLY__) */ /* -- cgit v1.2.3 From 5b572e8a9f3dcd6e3ba80ec5b5ec85472c88cb4c Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 26 Mar 2025 00:59:58 +0900 Subject: bits: introduce fixed-type BIT_U*() Implement fixed-type BIT_U*() to help drivers add stricter checks, like it was done for GENMASK_U*(). Signed-off-by: Lucas De Marchi Acked-by: Jani Nikula Co-developed-by: Vincent Mailhol Signed-off-by: Vincent Mailhol Reviewed-by: Andy Shevchenko Signed-off-by: Yury Norov --- include/linux/bits.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bits.h b/include/linux/bits.h index 9718c5ae5fc3..7ad056219115 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -24,7 +24,7 @@ /* * Missing asm support * - * GENMASK_U*() depend on BITS_PER_TYPE() which relies on sizeof(), + * GENMASK_U*() and BIT_U*() depend on BITS_PER_TYPE() which relies on sizeof(), * something not available in asm. Nevertheless, fixed width integers is a C * concept. Assembly code can rely on the long and long long versions instead. */ @@ -55,6 +55,24 @@ #define GENMASK_U32(h, l) GENMASK_TYPE(u32, h, l) #define GENMASK_U64(h, l) GENMASK_TYPE(u64, h, l) +/* + * Fixed-type variants of BIT(), with additional checks like GENMASK_TYPE(). The + * following examples generate compiler warnings due to -Wshift-count-overflow: + * + * - BIT_U8(8) + * - BIT_U32(-1) + * - BIT_U32(40) + */ +#define BIT_INPUT_CHECK(type, nr) \ + BUILD_BUG_ON_ZERO(const_true((nr) >= BITS_PER_TYPE(type))) + +#define BIT_TYPE(type, nr) ((type)(BIT_INPUT_CHECK(type, nr) + BIT_ULL(nr))) + +#define BIT_U8(nr) BIT_TYPE(u8, nr) +#define BIT_U16(nr) BIT_TYPE(u16, nr) +#define BIT_U32(nr) BIT_TYPE(u32, nr) +#define BIT_U64(nr) BIT_TYPE(u64, nr) + #else /* defined(__ASSEMBLY__) */ /* -- cgit v1.2.3 From 243c90e917f5cfc99821e5104d1c8a81c11cda4c Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sat, 29 Mar 2025 01:48:50 +0900 Subject: build_bug.h: more user friendly error messages in BUILD_BUG_ON_ZERO() __BUILD_BUG_ON_ZERO_MSG(), as introduced in [1], makes it possible to do a static assertions in expressions. The direct benefit is to provide a meaningful error message instead of the cryptic negative bitfield size error message currently returned by BUILD_BUG_ON_ZERO(): ./include/linux/build_bug.h:16:51: error: negative width in bit-field '' 16 | #define BUILD_BUG_ON_ZERO(e) ((int)(sizeof(struct { int:(-!!(e)); }))) | ^ Get rid of BUILD_BUG_ON_ZERO()'s bitfield size hack. Instead rely on __BUILD_BUG_ON_ZERO_MSG() which in turn relies on C11's _Static_assert(). Use some macro magic, similarly to static_assert(), to either use an optional error message provided by the user or, when omitted, to produce a default error message by stringifying the tested expression. With this, for example: BUILD_BUG_ON_ZERO(1 > 0) would now throw: ./include/linux/compiler.h:197:62: error: static assertion failed: "1 > 0 is true" 197 | define __BUILD_BUG_ON_ZERO_MSG(e, msg) ((int)sizeof(struct {_Static_assert(!(e), msg);})) | ^~~~~~~~~~~~~~ Finally, __BUILD_BUG_ON_ZERO_MSG() is already guarded by an: #ifdef __CHECKER__ So no need any more for that guard clause for BUILD_BUG_ON_ZERO(). Remove it. [1] commit d7a516c6eeae ("compiler.h: Fix undefined BUILD_BUG_ON_ZERO()") Link: https://git.kernel.org/torvalds/c/d7a516c6eeae Signed-off-by: Vincent Mailhol Link: https://git.kernel.org/next/linux-next/c/b88937277df Reviewed-by: Kees Cook Signed-off-by: Yury Norov --- include/linux/build_bug.h | 10 +++++----- include/linux/compiler.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/build_bug.h b/include/linux/build_bug.h index 3aa3640f8c18..2cfbb4c65c78 100644 --- a/include/linux/build_bug.h +++ b/include/linux/build_bug.h @@ -4,17 +4,17 @@ #include -#ifdef __CHECKER__ -#define BUILD_BUG_ON_ZERO(e) (0) -#else /* __CHECKER__ */ /* * Force a compilation error if condition is true, but also produce a * result (of value 0 and type int), so the expression can be used * e.g. in a structure initializer (or where-ever else comma expressions * aren't permitted). + * + * Take an error message as an optional second argument. If omitted, + * default to the stringification of the tested expression. */ -#define BUILD_BUG_ON_ZERO(e) ((int)(sizeof(struct { int:(-!!(e)); }))) -#endif /* __CHECKER__ */ +#define BUILD_BUG_ON_ZERO(e, ...) \ + __BUILD_BUG_ON_ZERO_MSG(e, ##__VA_ARGS__, #e " is true") /* Force a compilation error if a constant expression is not a power of 2 */ #define __BUILD_BUG_ON_NOT_POWER_OF_2(n) \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 27725f1ab5ab..6f04a1d8c720 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -192,9 +192,9 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, }) #ifdef __CHECKER__ -#define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) +#define __BUILD_BUG_ON_ZERO_MSG(e, msg, ...) (0) #else /* __CHECKER__ */ -#define __BUILD_BUG_ON_ZERO_MSG(e, msg) ((int)sizeof(struct {_Static_assert(!(e), msg);})) +#define __BUILD_BUG_ON_ZERO_MSG(e, msg, ...) ((int)sizeof(struct {_Static_assert(!(e), msg);})) #endif /* __CHECKER__ */ /* &a[0] degrades to a pointer: a different type from an array */ -- cgit v1.2.3 From 99c712d788c46452289beada638476b68b5a773b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 17 Apr 2025 19:17:15 +0300 Subject: bitmap-str: Get rid of 'extern' for function prototypes The bitmap-str.h uses mixed style for function prototypes. Drop the 'extern' as it easier to read and makes style aligned with a new code in the kernel. Signed-off-by: Andy Shevchenko Signed-off-by: Yury Norov --- include/linux/bitmap-str.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap-str.h b/include/linux/bitmap-str.h index 17caeca94cab..d758b4809a3a 100644 --- a/include/linux/bitmap-str.h +++ b/include/linux/bitmap-str.h @@ -4,10 +4,10 @@ int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, unsigned long *dst, int nbits); int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits); -extern int bitmap_print_bitmask_to_buf(char *buf, const unsigned long *maskp, - int nmaskbits, loff_t off, size_t count); -extern int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp, - int nmaskbits, loff_t off, size_t count); +int bitmap_print_bitmask_to_buf(char *buf, const unsigned long *maskp, int nmaskbits, + loff_t off, size_t count); +int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp, int nmaskbits, + loff_t off, size_t count); int bitmap_parse(const char *buf, unsigned int buflen, unsigned long *dst, int nbits); int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits); int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, -- cgit v1.2.3 From 89a44a808814d4717a8bf945ddebd5c39bfffcf4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 17 Apr 2025 19:17:16 +0300 Subject: bitmap-str: Add missing header(s) bitmap-str.h is not self-contained, it uses bool type that is provided in types.h and it uses __user annotation that is guaranteed to be included with types.h. Add missing header(s) to follow IWYU (Include What You Use) principle. Signed-off-by: Andy Shevchenko Signed-off-by: Yury Norov --- include/linux/bitmap-str.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitmap-str.h b/include/linux/bitmap-str.h index d758b4809a3a..53d3e1b32d3d 100644 --- a/include/linux/bitmap-str.h +++ b/include/linux/bitmap-str.h @@ -2,6 +2,8 @@ #ifndef __LINUX_BITMAP_STR_H #define __LINUX_BITMAP_STR_H +#include + int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, unsigned long *dst, int nbits); int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits); int bitmap_print_bitmask_to_buf(char *buf, const unsigned long *maskp, int nmaskbits, -- cgit v1.2.3 From a256ae22570ee4c3427fdc703a58a89afee6a332 Mon Sep 17 00:00:00 2001 From: Luo Jie Date: Thu, 17 Apr 2025 18:47:08 +0800 Subject: bitfield: Add FIELD_MODIFY() helper Add a helper for replacing the contents of bitfield in memory with the specified value. Even though a helper xxx_replace_bits() is available, it is not well documented, and only reports errors at the run time, which will not be helpful to catch possible overflow errors due to incorrect parameter types used. FIELD_MODIFY(REG_FIELD_C, ®, c) is the wrapper to the code below. reg &= ~REG_FIELD_C; reg |= FIELD_PREP(REG_FIELD_C, c); Yury: trim commit message, align backslashes. Signed-off-by: Luo Jie Signed-off-by: Yury Norov --- include/linux/bitfield.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index 63928f173223..6d9a53db54b6 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -8,6 +8,7 @@ #define _LINUX_BITFIELD_H #include +#include #include /* @@ -38,8 +39,7 @@ * FIELD_PREP(REG_FIELD_D, 0x40); * * Modify: - * reg &= ~REG_FIELD_C; - * reg |= FIELD_PREP(REG_FIELD_C, c); + * FIELD_MODIFY(REG_FIELD_C, ®, c); */ #define __bf_shf(x) (__builtin_ffsll(x) - 1) @@ -156,6 +156,23 @@ (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \ }) +/** + * FIELD_MODIFY() - modify a bitfield element + * @_mask: shifted mask defining the field's length and position + * @_reg_p: pointer to the memory that should be updated + * @_val: value to store in the bitfield + * + * FIELD_MODIFY() modifies the set of bits in @_reg_p specified by @_mask, + * by replacing them with the bitfield value passed in as @_val. + */ +#define FIELD_MODIFY(_mask, _reg_p, _val) \ + ({ \ + typecheck_pointer(_reg_p); \ + __BF_FIELD_CHECK(_mask, *(_reg_p), _val, "FIELD_MODIFY: "); \ + *(_reg_p) &= ~(_mask); \ + *(_reg_p) |= (((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask)); \ + }) + extern void __compiletime_error("value doesn't fit into mask") __field_overflow(void); extern void __compiletime_error("bad bitfield mask") -- cgit v1.2.3 From 007c07168ac0c64387be500f6604b09ace3f3bdc Mon Sep 17 00:00:00 2001 From: Su Hui Date: Wed, 30 Apr 2025 11:27:32 +0800 Subject: time/jiffies: Change register_refined_jiffies() to void __init register_refined_jiffies() is only used in setup code and always returns 0. Mark it as __init to save some bytes and change it to void. Signed-off-by: Su Hui Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250430032734.2079290-2-suhui@nfschina.com --- include/linux/jiffies.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 0ea8c9887429..91b20788273d 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -59,7 +59,7 @@ /* LATCH is used in the interval timer and ftape setup. */ #define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ) /* For divider */ -extern int register_refined_jiffies(long clock_tick_rate); +extern void register_refined_jiffies(long clock_tick_rate); /* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */ #define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ) -- cgit v1.2.3 From 6c9ffa2ae48e8fd44ba5c6ffdc16e62a62bac38a Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Fri, 25 Apr 2025 13:18:18 +0200 Subject: power: supply: add inhibit-charge-awake to charge_behaviour MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OneXPlayer devices have a charge inhibit feature that allows the user to select between it being active always or only when the device is on. Therefore, add attribute inhibit-charge-awake to charge_behaviour to allow the user to select that charge should be paused only when the device is awake. Reviewed-by: Hans de Goede Reviewed-by: Thomas Weißschuh Reviewed-by: Derek J. Clark Signed-off-by: Antheas Kapenekakis Link: https://lore.kernel.org/r/20250425111821.88746-14-lkml@antheas.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/power_supply.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 888824592953..cbec930430a7 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -212,6 +212,7 @@ enum power_supply_usb_type { enum power_supply_charge_behaviour { POWER_SUPPLY_CHARGE_BEHAVIOUR_AUTO = 0, POWER_SUPPLY_CHARGE_BEHAVIOUR_INHIBIT_CHARGE, + POWER_SUPPLY_CHARGE_BEHAVIOUR_INHIBIT_CHARGE_AWAKE, POWER_SUPPLY_CHARGE_BEHAVIOUR_FORCE_DISCHARGE, }; -- cgit v1.2.3 From 144530c15ec7fa95b29812d86f4be527338ea204 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 25 Apr 2025 13:46:16 -0700 Subject: pds_core: remove extra name description Fix the kernel-doc complaint include/linux/pds/pds_adminq.h:481: warning: Excess struct member 'name' description in 'pds_core_lif_getattr_comp' Reviewed-by: Simon Horman Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller --- include/linux/pds/pds_adminq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h index ddd111f04ca0..339156113fa5 100644 --- a/include/linux/pds/pds_adminq.h +++ b/include/linux/pds/pds_adminq.h @@ -463,7 +463,6 @@ struct pds_core_lif_getattr_cmd { * @rsvd: Word boundary padding * @comp_index: Index in the descriptor ring for which this is the completion * @state: LIF state (enum pds_core_lif_state) - * @name: LIF name string, 0 terminated * @features: Features (enum pds_core_hw_features) * @rsvd2: Word boundary padding * @color: Color bit -- cgit v1.2.3 From 7c4f4c4fa9b6fbb7e483bebd02f7b9cbc20ca5cc Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 25 Apr 2025 13:46:17 -0700 Subject: pds_core: smaller adminq poll starting interval Shorten the adminq poll starting interval in order to notice any transaction errors more quickly. Signed-off-by: Shannon Nelson Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/pds/pds_adminq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h index 339156113fa5..40ff0ec2b879 100644 --- a/include/linux/pds/pds_adminq.h +++ b/include/linux/pds/pds_adminq.h @@ -4,7 +4,7 @@ #ifndef _PDS_CORE_ADMINQ_H_ #define _PDS_CORE_ADMINQ_H_ -#define PDSC_ADMINQ_MAX_POLL_INTERVAL 256 +#define PDSC_ADMINQ_MAX_POLL_INTERVAL 256000 /* usecs */ enum pds_core_adminq_flags { PDS_AQ_FLAG_FASTPOLL = BIT(1), /* completion poll at 1ms */ -- cgit v1.2.3 From fc7fed6f77f94f2fd9a7557020503e146eb0ce38 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 25 Mar 2025 11:58:46 +0000 Subject: coresight: Convert tag clear function to take a struct csdev_access The self hosted claim tag will be reset on device probe in a later commit. We'll want to do this before coresight_register() is called so won't have a coresight_device and have to use csdev_access instead. Also make them public and create locked and unlocked versions for later use. These look functions look like they set the whole tags register as one value, but they only set and clear the self hosted bit using a SET/CLR bits mechanism so also rename the functions to reflect this better. Reviewed-by: Leo Yan Reviewed-by: Yeoreum Yun Signed-off-by: James Clark Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250325-james-coresight-claim-tags-v4-1-dfbd3822b2e5@linaro.org --- include/linux/coresight.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index cfcf6e4707ed..b89692d9ceac 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -685,7 +685,8 @@ extern int coresight_timeout_action(struct csdev_access *csa, u32 offset, extern int coresight_claim_device(struct coresight_device *csdev); extern int coresight_claim_device_unlocked(struct coresight_device *csdev); - +void coresight_clear_self_claim_tag(struct csdev_access *csa); +void coresight_clear_self_claim_tag_unlocked(struct csdev_access *csa); extern void coresight_disclaim_device(struct coresight_device *csdev); extern void coresight_disclaim_device_unlocked(struct coresight_device *csdev); extern char *coresight_alloc_device_name(struct coresight_dev_list *devs, -- cgit v1.2.3 From e6e6b692865d333d79d25c761c53b19e73e9653f Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 25 Mar 2025 11:58:52 +0000 Subject: coresight: Remove extern from function declarations Function declarations are extern by default so remove the extra noise and inconsistency. Reviewed-by: Leo Yan Reviewed-by: Yeoreum Yun Signed-off-by: James Clark Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250325-james-coresight-claim-tags-v4-7-dfbd3822b2e5@linaro.org --- include/linux/coresight.h | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index b89692d9ceac..8abdd8b5c791 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -671,28 +671,27 @@ static inline void coresight_set_mode(struct coresight_device *csdev, local_set(&csdev->mode, new_mode); } -extern struct coresight_device * -coresight_register(struct coresight_desc *desc); -extern void coresight_unregister(struct coresight_device *csdev); -extern int coresight_enable_sysfs(struct coresight_device *csdev); -extern void coresight_disable_sysfs(struct coresight_device *csdev); -extern int coresight_timeout(struct csdev_access *csa, u32 offset, - int position, int value); +struct coresight_device *coresight_register(struct coresight_desc *desc); +void coresight_unregister(struct coresight_device *csdev); +int coresight_enable_sysfs(struct coresight_device *csdev); +void coresight_disable_sysfs(struct coresight_device *csdev); +int coresight_timeout(struct csdev_access *csa, u32 offset, int position, int value); typedef void (*coresight_timeout_cb_t) (struct csdev_access *, u32, int, int); -extern int coresight_timeout_action(struct csdev_access *csa, u32 offset, - int position, int value, - coresight_timeout_cb_t cb); +int coresight_timeout_action(struct csdev_access *csa, u32 offset, int position, int value, + coresight_timeout_cb_t cb); +int coresight_claim_device(struct coresight_device *csdev); +int coresight_claim_device_unlocked(struct coresight_device *csdev); -extern int coresight_claim_device(struct coresight_device *csdev); -extern int coresight_claim_device_unlocked(struct coresight_device *csdev); +int coresight_claim_device(struct coresight_device *csdev); +int coresight_claim_device_unlocked(struct coresight_device *csdev); void coresight_clear_self_claim_tag(struct csdev_access *csa); void coresight_clear_self_claim_tag_unlocked(struct csdev_access *csa); -extern void coresight_disclaim_device(struct coresight_device *csdev); -extern void coresight_disclaim_device_unlocked(struct coresight_device *csdev); -extern char *coresight_alloc_device_name(struct coresight_dev_list *devs, +void coresight_disclaim_device(struct coresight_device *csdev); +void coresight_disclaim_device_unlocked(struct coresight_device *csdev); +char *coresight_alloc_device_name(struct coresight_dev_list *devs, struct device *dev); -extern bool coresight_loses_context_with_cpu(struct device *dev); +bool coresight_loses_context_with_cpu(struct device *dev); u32 coresight_relaxed_read32(struct coresight_device *csdev, u32 offset); u32 coresight_read32(struct coresight_device *csdev, u32 offset); @@ -705,8 +704,8 @@ void coresight_relaxed_write64(struct coresight_device *csdev, u64 val, u32 offset); void coresight_write64(struct coresight_device *csdev, u64 val, u32 offset); -extern int coresight_get_cpu(struct device *dev); -extern int coresight_get_static_trace_id(struct device *dev, u32 *id); +int coresight_get_cpu(struct device *dev); +int coresight_get_static_trace_id(struct device *dev, u32 *id); struct coresight_platform_data *coresight_get_platform_data(struct device *dev); struct coresight_connection * -- cgit v1.2.3 From 468d8b462ac64659caec53eff34f02963d5f52c8 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 15 Apr 2025 21:15:45 -0500 Subject: iidc/ice/irdma: Rename IDC header file To prepare for the IDC upgrade to support different CORE PCI drivers, rename header file from iidc.h to iidc_rdma.h since this files functionality is specifically for RDMA support. Use net/dscp.h include in irdma osdep.h and DSCP_MAX type.h, instead of iidc header and define. Reviewed-by: Przemek Kitszel Signed-off-by: Dave Ertman Signed-off-by: Tatyana Nikolova Signed-off-by: Tony Nguyen --- include/linux/net/intel/iidc.h | 109 ------------------------------------ include/linux/net/intel/iidc_rdma.h | 109 ++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 109 deletions(-) delete mode 100644 include/linux/net/intel/iidc.h create mode 100644 include/linux/net/intel/iidc_rdma.h (limited to 'include/linux') diff --git a/include/linux/net/intel/iidc.h b/include/linux/net/intel/iidc.h deleted file mode 100644 index 13274c3def66..000000000000 --- a/include/linux/net/intel/iidc.h +++ /dev/null @@ -1,109 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2021, Intel Corporation. */ - -#ifndef _IIDC_H_ -#define _IIDC_H_ - -#include -#include -#include -#include -#include -#include - -enum iidc_event_type { - IIDC_EVENT_BEFORE_MTU_CHANGE, - IIDC_EVENT_AFTER_MTU_CHANGE, - IIDC_EVENT_BEFORE_TC_CHANGE, - IIDC_EVENT_AFTER_TC_CHANGE, - IIDC_EVENT_CRIT_ERR, - IIDC_EVENT_NBITS /* must be last */ -}; - -enum iidc_reset_type { - IIDC_PFR, - IIDC_CORER, - IIDC_GLOBR, -}; - -enum iidc_rdma_protocol { - IIDC_RDMA_PROTOCOL_IWARP = BIT(0), - IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1), -}; - -#define IIDC_MAX_USER_PRIORITY 8 -#define IIDC_MAX_DSCP_MAPPING 64 -#define IIDC_DSCP_PFC_MODE 0x1 - -/* Struct to hold per RDMA Qset info */ -struct iidc_rdma_qset_params { - /* Qset TEID returned to the RDMA driver in - * ice_add_rdma_qset and used by RDMA driver - * for calls to ice_del_rdma_qset - */ - u32 teid; /* Qset TEID */ - u16 qs_handle; /* RDMA driver provides this */ - u16 vport_id; /* VSI index */ - u8 tc; /* TC branch the Qset should belong to */ -}; - -struct iidc_qos_info { - u64 tc_ctx; - u8 rel_bw; - u8 prio_type; - u8 egress_virt_up; - u8 ingress_virt_up; -}; - -/* Struct to pass QoS info */ -struct iidc_qos_params { - struct iidc_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; - u8 up2tc[IIDC_MAX_USER_PRIORITY]; - u8 vport_relative_bw; - u8 vport_priority_type; - u8 num_tc; - u8 pfc_mode; - u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; -}; - -struct iidc_event { - DECLARE_BITMAP(type, IIDC_EVENT_NBITS); - u32 reg; -}; - -struct ice_pf; - -int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type); -int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); -void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos); -int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); -void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); - -/* Structure representing auxiliary driver tailored information about the core - * PCI dev, each auxiliary driver using the IIDC interface will have an - * instance of this struct dedicated to it. - */ - -struct iidc_auxiliary_dev { - struct auxiliary_device adev; - struct ice_pf *pf; -}; - -/* structure representing the auxiliary driver. This struct is to be - * allocated and populated by the auxiliary driver's owner. The core PCI - * driver will access these ops by performing a container_of on the - * auxiliary_device->dev.driver. - */ -struct iidc_auxiliary_drv { - struct auxiliary_driver adrv; - /* This event_handler is meant to be a blocking call. For instance, - * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not - * return until the auxiliary driver is ready for the MTU change to - * happen. - */ - void (*event_handler)(struct ice_pf *pf, struct iidc_event *event); -}; - -#endif /* _IIDC_H_*/ diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h new file mode 100644 index 000000000000..0cd75404e459 --- /dev/null +++ b/include/linux/net/intel/iidc_rdma.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2021, Intel Corporation. */ + +#ifndef _IIDC_RDMA_H_ +#define _IIDC_RDMA_H_ + +#include +#include +#include +#include +#include +#include + +enum iidc_event_type { + IIDC_EVENT_BEFORE_MTU_CHANGE, + IIDC_EVENT_AFTER_MTU_CHANGE, + IIDC_EVENT_BEFORE_TC_CHANGE, + IIDC_EVENT_AFTER_TC_CHANGE, + IIDC_EVENT_CRIT_ERR, + IIDC_EVENT_NBITS /* must be last */ +}; + +enum iidc_reset_type { + IIDC_PFR, + IIDC_CORER, + IIDC_GLOBR, +}; + +enum iidc_rdma_protocol { + IIDC_RDMA_PROTOCOL_IWARP = BIT(0), + IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1), +}; + +#define IIDC_MAX_USER_PRIORITY 8 +#define IIDC_MAX_DSCP_MAPPING 64 +#define IIDC_DSCP_PFC_MODE 0x1 + +/* Struct to hold per RDMA Qset info */ +struct iidc_rdma_qset_params { + /* Qset TEID returned to the RDMA driver in + * ice_add_rdma_qset and used by RDMA driver + * for calls to ice_del_rdma_qset + */ + u32 teid; /* Qset TEID */ + u16 qs_handle; /* RDMA driver provides this */ + u16 vport_id; /* VSI index */ + u8 tc; /* TC branch the Qset should belong to */ +}; + +struct iidc_qos_info { + u64 tc_ctx; + u8 rel_bw; + u8 prio_type; + u8 egress_virt_up; + u8 ingress_virt_up; +}; + +/* Struct to pass QoS info */ +struct iidc_qos_params { + struct iidc_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; + u8 up2tc[IIDC_MAX_USER_PRIORITY]; + u8 vport_relative_bw; + u8 vport_priority_type; + u8 num_tc; + u8 pfc_mode; + u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; +}; + +struct iidc_event { + DECLARE_BITMAP(type, IIDC_EVENT_NBITS); + u32 reg; +}; + +struct ice_pf; + +int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); +int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); +int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type); +int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); +void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos); +int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); +void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); + +/* Structure representing auxiliary driver tailored information about the core + * PCI dev, each auxiliary driver using the IIDC interface will have an + * instance of this struct dedicated to it. + */ + +struct iidc_auxiliary_dev { + struct auxiliary_device adev; + struct ice_pf *pf; +}; + +/* structure representing the auxiliary driver. This struct is to be + * allocated and populated by the auxiliary driver's owner. The core PCI + * driver will access these ops by performing a container_of on the + * auxiliary_device->dev.driver. + */ +struct iidc_auxiliary_drv { + struct auxiliary_driver adrv; + /* This event_handler is meant to be a blocking call. For instance, + * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not + * return until the auxiliary driver is ready for the MTU change to + * happen. + */ + void (*event_handler)(struct ice_pf *pf, struct iidc_event *event); +}; + +#endif /* _IIDC_RDMA_H_*/ -- cgit v1.2.3 From 118c40b7b50340bf7ff7e0adee8e3bab6e552c82 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 28 Mar 2025 18:21:44 +0100 Subject: kbuild: require gcc-8 and binutils-2.30 Commit a3e8fe814ad1 ("x86/build: Raise the minimum GCC version to 8.1") raised the minimum compiler version as enforced by Kbuild to gcc-8.1 and clang-15 for x86. This is actually the same gcc version that has been discussed as the minimum for all architectures several times in the past, with little objection. A previous concern was the kernel for SLE15-SP7 needing to be built with gcc-7. As this ended up still using linux-6.4 and there is no plan for an SP8, this is no longer a problem. Change it for all architectures and adjust the documentation accordingly. A few version checks can be removed in the process. The binutils version 2.30 is the lowest version used in combination with gcc-8 on common distros, so use that as the corresponding minimum. Link: https://lore.kernel.org/lkml/20240925150059.3955569-32-ardb+git@google.com/ Link: https://lore.kernel.org/lkml/871q7yxrgv.wl-tiwai@suse.de/ Acked-by: Mark Rutland Signed-off-by: Arnd Bergmann --- include/linux/unroll.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/unroll.h b/include/linux/unroll.h index 863fb69f6a7e..186b71de740f 100644 --- a/include/linux/unroll.h +++ b/include/linux/unroll.h @@ -11,10 +11,8 @@ #ifdef CONFIG_CC_IS_CLANG #define __pick_unrolled(x, y) _Pragma(#x) -#elif CONFIG_GCC_VERSION >= 80000 -#define __pick_unrolled(x, y) _Pragma(#y) #else -#define __pick_unrolled(x, y) /* not supported */ +#define __pick_unrolled(x, y) _Pragma(#y) #endif /** -- cgit v1.2.3 From 97b5631aae6896369712d6b7131afbc95c753587 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 15 Apr 2025 21:15:46 -0500 Subject: iidc/ice/irdma: Rename to iidc_* convention In preparation of supporting more than a single core PCI driver for RDMA, homogenize naming to iidc_rdma_* and IIDC_RDMA_* form. Reviewed-by: Przemek Kitszel Signed-off-by: Dave Ertman Signed-off-by: Tatyana Nikolova Signed-off-by: Tony Nguyen --- include/linux/net/intel/iidc_rdma.h | 38 +++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h index 0cd75404e459..2b24a9912fa0 100644 --- a/include/linux/net/intel/iidc_rdma.h +++ b/include/linux/net/intel/iidc_rdma.h @@ -11,16 +11,16 @@ #include #include -enum iidc_event_type { - IIDC_EVENT_BEFORE_MTU_CHANGE, - IIDC_EVENT_AFTER_MTU_CHANGE, - IIDC_EVENT_BEFORE_TC_CHANGE, - IIDC_EVENT_AFTER_TC_CHANGE, - IIDC_EVENT_CRIT_ERR, - IIDC_EVENT_NBITS /* must be last */ +enum iidc_rdma_event_type { + IIDC_RDMA_EVENT_BEFORE_MTU_CHANGE, + IIDC_RDMA_EVENT_AFTER_MTU_CHANGE, + IIDC_RDMA_EVENT_BEFORE_TC_CHANGE, + IIDC_RDMA_EVENT_AFTER_TC_CHANGE, + IIDC_RDMA_EVENT_CRIT_ERR, + IIDC_RDMA_EVENT_NBITS /* must be last */ }; -enum iidc_reset_type { +enum iidc_rdma_reset_type { IIDC_PFR, IIDC_CORER, IIDC_GLOBR, @@ -47,7 +47,7 @@ struct iidc_rdma_qset_params { u8 tc; /* TC branch the Qset should belong to */ }; -struct iidc_qos_info { +struct iidc_rdma_qos_info { u64 tc_ctx; u8 rel_bw; u8 prio_type; @@ -56,8 +56,8 @@ struct iidc_qos_info { }; /* Struct to pass QoS info */ -struct iidc_qos_params { - struct iidc_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; +struct iidc_rdma_qos_params { + struct iidc_rdma_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; u8 up2tc[IIDC_MAX_USER_PRIORITY]; u8 vport_relative_bw; u8 vport_priority_type; @@ -66,8 +66,8 @@ struct iidc_qos_params { u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; }; -struct iidc_event { - DECLARE_BITMAP(type, IIDC_EVENT_NBITS); +struct iidc_rdma_event { + DECLARE_BITMAP(type, IIDC_RDMA_EVENT_NBITS); u32 reg; }; @@ -75,9 +75,11 @@ struct ice_pf; int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type); +int ice_rdma_request_reset(struct ice_pf *pf, + enum iidc_rdma_reset_type reset_type); int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); -void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos); +void ice_get_qos_params(struct ice_pf *pf, + struct iidc_rdma_qos_params *qos); int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); @@ -86,7 +88,7 @@ void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); * instance of this struct dedicated to it. */ -struct iidc_auxiliary_dev { +struct iidc_rdma_core_auxiliary_dev { struct auxiliary_device adev; struct ice_pf *pf; }; @@ -96,14 +98,14 @@ struct iidc_auxiliary_dev { * driver will access these ops by performing a container_of on the * auxiliary_device->dev.driver. */ -struct iidc_auxiliary_drv { +struct iidc_rdma_core_auxiliary_drv { struct auxiliary_driver adrv; /* This event_handler is meant to be a blocking call. For instance, * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not * return until the auxiliary driver is ready for the MTU change to * happen. */ - void (*event_handler)(struct ice_pf *pf, struct iidc_event *event); + void (*event_handler)(struct ice_pf *pf, struct iidc_rdma_event *event); }; #endif /* _IIDC_RDMA_H_*/ -- cgit v1.2.3 From d9251a560ba67bbedd53b81aee32e1ad95f42000 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 15 Apr 2025 21:15:47 -0500 Subject: iidc/ice/irdma: Break iidc.h into two headers In preparation of supporting more than a single core PCI driver for RDMA, break the iidc_rdma.h header file into two more focused headers. Only the elements universal to all Intel drivers will remain in the generic iidc_rdma.h header. Move the ice specific information to an ice specific header file named iidc_rdma_ice.h. Reviewed-by: Przemek Kitszel Signed-off-by: Dave Ertman Signed-off-by: Tatyana Nikolova Signed-off-by: Tony Nguyen --- include/linux/net/intel/iidc_rdma.h | 14 +------------- include/linux/net/intel/iidc_rdma_ice.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 13 deletions(-) create mode 100644 include/linux/net/intel/iidc_rdma_ice.h (limited to 'include/linux') diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h index 2b24a9912fa0..1e8136395154 100644 --- a/include/linux/net/intel/iidc_rdma.h +++ b/include/linux/net/intel/iidc_rdma.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2021, Intel Corporation. */ +/* Copyright (C) 2021-2025, Intel Corporation. */ #ifndef _IIDC_RDMA_H_ #define _IIDC_RDMA_H_ @@ -71,18 +71,6 @@ struct iidc_rdma_event { u32 reg; }; -struct ice_pf; - -int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_rdma_request_reset(struct ice_pf *pf, - enum iidc_rdma_reset_type reset_type); -int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); -void ice_get_qos_params(struct ice_pf *pf, - struct iidc_rdma_qos_params *qos); -int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); -void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); - /* Structure representing auxiliary driver tailored information about the core * PCI dev, each auxiliary driver using the IIDC interface will have an * instance of this struct dedicated to it. diff --git a/include/linux/net/intel/iidc_rdma_ice.h b/include/linux/net/intel/iidc_rdma_ice.h new file mode 100644 index 000000000000..78d10003d776 --- /dev/null +++ b/include/linux/net/intel/iidc_rdma_ice.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2021-2025, Intel Corporation. */ + +#ifndef _IIDC_RDMA_ICE_H_ +#define _IIDC_RDMA_ICE_H_ + +struct ice_pf; + +int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); +int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); +int ice_rdma_request_reset(struct ice_pf *pf, + enum iidc_rdma_reset_type reset_type); +int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); +void ice_get_qos_params(struct ice_pf *pf, + struct iidc_rdma_qos_params *qos); +int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); +void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); + +#endif /* _IIDC_RDMA_ICE_H_*/ -- cgit v1.2.3 From 8239b771b94b639556c1987185fd82b2a896c923 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Tue, 15 Apr 2025 21:15:48 -0500 Subject: ice: Replace ice specific DSCP mapping num with a kernel define Replace ice driver specific DSCP mapping number defines ICE_DSCP_NUM_VAL and IIDC_MAX_DSCP_MAPPING with an equivalent kernel define DSCP_MAX. Reviewed-by: Przemek Kitszel Signed-off-by: Tatyana Nikolova Signed-off-by: Dave Ertman Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen --- include/linux/net/intel/iidc_rdma.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h index 1e8136395154..7f1910289534 100644 --- a/include/linux/net/intel/iidc_rdma.h +++ b/include/linux/net/intel/iidc_rdma.h @@ -10,6 +10,7 @@ #include #include #include +#include enum iidc_rdma_event_type { IIDC_RDMA_EVENT_BEFORE_MTU_CHANGE, @@ -32,7 +33,6 @@ enum iidc_rdma_protocol { }; #define IIDC_MAX_USER_PRIORITY 8 -#define IIDC_MAX_DSCP_MAPPING 64 #define IIDC_DSCP_PFC_MODE 0x1 /* Struct to hold per RDMA Qset info */ @@ -63,7 +63,7 @@ struct iidc_rdma_qos_params { u8 vport_priority_type; u8 num_tc; u8 pfc_mode; - u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; + u8 dscp_map[DSCP_MAX]; }; struct iidc_rdma_event { -- cgit v1.2.3 From 1b3f2bd04d90f61e1f291b5e365b9bc4ce0ea7c7 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 29 Apr 2025 19:46:21 -0700 Subject: x86/devmem: Remove duplicate range_is_allowed() definition 17 years ago, Venki suggested [1] "A future improvement would be to avoid the range_is_allowed duplication". The only thing preventing a common implementation is that phys_mem_access_prot_allowed() expects the range check to exit immediately when PAT is disabled [2]. I.e. there is no cache conflict to manage in that case. This cleanup was noticed on the path to considering changing range_is_allowed() policy to blanket deny /dev/mem for private (confidential computing) memory. Note, however that phys_mem_access_prot_allowed() has long since stopped being relevant for managing cache-type validation due to [3], and [4]. Commit 0124cecfc85a ("x86, PAT: disable /dev/mem mmap RAM with PAT") [1] Commit 9e41bff2708e ("x86: fix /dev/mem mmap breakage when PAT is disabled") [2] Commit 1886297ce0c8 ("x86/mm/pat: Fix BUG_ON() in mmap_mem() on QEMU/i386") [3] Commit 0c3c8a18361a ("x86, PAT: Remove duplicate memtype reserve in devmem mmap") [4] Signed-off-by: Dan Williams Signed-off-by: Dave Hansen Reviewed-by: Nikolay Borisov Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/all/20250430024622.1134277-2-dan.j.williams%40intel.com --- include/linux/io.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io.h b/include/linux/io.h index 6a6bc4d46d0a..0642c7ee41db 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -183,4 +183,25 @@ static inline void arch_io_free_memtype_wc(resource_size_t base, int devm_arch_io_reserve_memtype_wc(struct device *dev, resource_size_t start, resource_size_t size); +#ifdef CONFIG_STRICT_DEVMEM +static inline int range_is_allowed(unsigned long pfn, unsigned long size) +{ + u64 from = ((u64)pfn) << PAGE_SHIFT; + u64 to = from + size; + u64 cursor = from; + + while (cursor < to) { + if (!devmem_is_allowed(pfn)) + return 0; + cursor += PAGE_SIZE; + pfn++; + } + return 1; +} +#else +static inline int range_is_allowed(unsigned long pfn, unsigned long size) +{ + return 1; +} +#endif #endif /* _LINUX_IO_H */ -- cgit v1.2.3 From a3e1c0ad835702555d90565584ab6f723adf7f94 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 29 Apr 2025 08:04:46 +0200 Subject: net: phy: factor out provider part from mdio_bus.c After 52358dd63e34 ("net: phy: remove function stubs") there's a problem if CONFIG_MDIO_BUS is set, but CONFIG_PHYLIB is not. mdiobus_scan() uses phylib functions like get_phy_device(). Bringing back the stub wouldn't make much sense, because it would allow to compile mdiobus_scan(), but the function would be unusable. The stub returned NULL, and we have the following in mdiobus_scan(): phydev = get_phy_device(bus, addr, c45); if (IS_ERR(phydev)) return phydev; So calling mdiobus_scan() w/o CONFIG_PHYLIB would cause a crash later in mdiobus_scan(). In general the PHYLIB functionality isn't optional here. Consequently, MDIO bus providers depend on PHYLIB. Therefore factor it out and build it together with the libphy core modules. In addition make all MDIO bus providers under /drivers/net/mdio depend on PHYLIB. Same applies to enetc MDIO bus provider. Note that PHYLIB selects MDIO_DEVRES, therefore we can omit this here. Fixes: 52358dd63e34 ("net: phy: remove function stubs") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202504270639.mT0lh2o1-lkp@intel.com/ Reviewed-by: Jacob Keller Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/c74772a9-dab6-44bf-a657-389df89d85c2@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 3beaf225ee88..d62d292024bc 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2062,6 +2062,7 @@ int __phy_hwtstamp_set(struct phy_device *phydev, struct netlink_ext_ack *extack); extern const struct bus_type mdio_bus_type; +extern const struct class mdio_bus_class; struct mdio_board_info { const char *bus_id; -- cgit v1.2.3 From 8c5d8c0b9e8184a86baede72ca392f90d867d22e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 23 Apr 2025 14:21:28 +0530 Subject: OPP: Define and use scope-based cleanup helpers Define and use scope-based cleanup helpers for `struct opp` and `struct opp_table`. No intentional functional impact. Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 0deddfa91aca..8313ed981535 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -11,6 +11,7 @@ #ifndef __LINUX_OPP_H__ #define __LINUX_OPP_H__ +#include #include #include #include @@ -579,6 +580,12 @@ static inline int dev_pm_opp_of_find_icc_paths(struct device *dev, struct opp_ta } #endif +/* Scope based cleanup macro for OPP reference counting */ +DEFINE_FREE(put_opp, struct dev_pm_opp *, if (!IS_ERR_OR_NULL(_T)) dev_pm_opp_put(_T)) + +/* Scope based cleanup macro for OPP table reference counting */ +DEFINE_FREE(put_opp_table, struct opp_table *, if (!IS_ERR_OR_NULL(_T)) dev_pm_opp_put_opp_table(_T)) + /* OPP Configuration helpers */ static inline int dev_pm_opp_add(struct device *dev, unsigned long freq, -- cgit v1.2.3 From ee3de3cf7035aaf289085111fd0cfaddc93f0409 Mon Sep 17 00:00:00 2001 From: Praveen Talari Date: Fri, 2 May 2025 10:58:22 +0530 Subject: OPP: Add dev_pm_opp_set_level() To configure a device to a specific performance level, consumer drivers currently need to determine the OPP based on the exact level and then set it, resulting in code duplication across drivers. The new helper API, dev_pm_opp_set_level(), addresses this issue by providing a streamlined method for consumer drivers to find and set the OPP based on the desired performance level, thereby eliminating redundancy. Signed-off-by: Praveen Talari [ Viresh: Lot of fixes in the code, and rebased over latest changes. Fixed commit log too. ] Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 8313ed981535..cf477beae4bb 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -197,6 +197,7 @@ int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) void dev_pm_opp_remove_table(struct device *dev); void dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask); int dev_pm_opp_sync_regulators(struct device *dev); + #else static inline struct opp_table *dev_pm_opp_get_opp_table(struct device *dev) { @@ -717,4 +718,14 @@ static inline unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp) return dev_pm_opp_get_freq_indexed(opp, 0); } +static inline int dev_pm_opp_set_level(struct device *dev, unsigned int level) +{ + struct dev_pm_opp *opp __free(put_opp) = dev_pm_opp_find_level_exact(dev, level); + + if (IS_ERR(opp)) + return PTR_ERR(opp); + + return dev_pm_opp_set_opp(dev, opp); +} + #endif /* __LINUX_OPP_H__ */ -- cgit v1.2.3 From 2e9b2ee2ba403cbe270a8256b8794ef5ad19b38d Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 30 Apr 2025 10:52:49 +0800 Subject: iommu: Cleanup comments for dev_enable/disable_feat The dev_enable/disable_feat ops have been removed by commit ("iommu: Remove iommu_dev_enable/disable_feature()"). Cleanup the comments to make the code clean. Signed-off-by: Lu Baolu Reviewed-by: Vasant Hegde Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20250430025249.2371751-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bc5363e6b43e..56b4a1c2e031 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -590,8 +590,6 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * @of_xlate: add OF master IDs to iommu grouping * @is_attach_deferred: Check if domain attach should be deferred from iommu * driver init to device driver init (default no) - * @dev_enable/disable_feat: per device entries to enable/disable - * iommu specific features. * @page_response: handle page request response * @def_domain_type: device default domain type, return value: * - IOMMU_DOMAIN_IDENTITY: must use an identity domain -- cgit v1.2.3 From b5325b2a270fcaf7b2a9a0f23d422ca8a5a8bdea Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 14 Apr 2025 15:55:07 +0200 Subject: coredump: hand a pidfd to the usermode coredump helper Give userspace a way to instruct the kernel to install a pidfd into the usermode helper process. This makes coredump handling a lot more reliable for userspace. In parallel with this commit we already have systemd adding support for this in [1]. We create a pidfs file for the coredumping process when we process the corename pattern. When the usermode helper process is forked we then install the pidfs file as file descriptor three into the usermode helpers file descriptor table so it's available to the exec'd program. Since usermode helpers are either children of the system_unbound_wq workqueue or kthreadd we know that the file descriptor table is empty and can thus always use three as the file descriptor number. Note, that we'll install a pidfd for the thread-group leader even if a subthread is calling do_coredump(). We know that task linkage hasn't been removed due to delay_group_leader() and even if this @current isn't the actual thread-group leader we know that the thread-group leader cannot be reaped until @current has exited. Link: https://github.com/systemd/systemd/pull/37125 [1] Link: https://lore.kernel.org/20250414-work-coredump-v2-3-685bf231f828@kernel.org Tested-by: Luca Boccassi Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- include/linux/coredump.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 77e6e195d1d6..76e41805b92d 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -28,6 +28,7 @@ struct coredump_params { int vma_count; size_t vma_data_size; struct core_vma_metadata *vma_meta; + struct pid *pid; }; extern unsigned int core_file_note_size_limit; -- cgit v1.2.3 From 66d454e99d71857faf249486912e381ec83760b4 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 2 May 2025 09:15:21 -0700 Subject: bpf: udp: Make sure iter->batch always contains a full bucket snapshot Require that iter->batch always contains a full bucket snapshot. This invariant is important to avoid skipping or repeating sockets during iteration when combined with the next few patches. Before, there were two cases where a call to bpf_iter_udp_batch may only capture part of a bucket: 1. When bpf_iter_udp_realloc_batch() returns -ENOMEM [1]. 2. When more sockets are added to the bucket while calling bpf_iter_udp_realloc_batch(), making the updated batch size insufficient [2]. In cases where the batch size only covers part of a bucket, it is possible to forget which sockets were already visited, especially if we have to process a bucket in more than two batches. This forces us to choose between repeating or skipping sockets, so don't allow this: 1. Stop iteration and propagate -ENOMEM up to userspace if reallocation fails instead of continuing with a partial batch. 2. Try bpf_iter_udp_realloc_batch() with GFP_USER just as before, but if we still aren't able to capture the full bucket, call bpf_iter_udp_realloc_batch() again while holding the bucket lock to guarantee the bucket does not change. On the second attempt use GFP_NOWAIT since we hold onto the spin lock. Introduce the udp_portaddr_for_each_entry_from macro and use it instead of udp_portaddr_for_each_entry to make it possible to continue iteration from an arbitrary socket. This is required for this patch in the GFP_NOWAIT case to allow us to fill the rest of a batch starting from the middle of a bucket and the later patch which skips sockets that were already seen. Testing all scenarios directly is a bit difficult, but I did some manual testing to exercise the code paths where GFP_NOWAIT is used and where ERR_PTR(err) is returned. I used the realloc test case included later in this series to trigger a scenario where a realloc happens inside bpf_iter_udp_batch and made a small code tweak to force the first realloc attempt to allocate a too-small batch, thus requiring another attempt with GFP_NOWAIT. Some printks showed both reallocs with the tests passing: Apr 25 23:16:24 crow kernel: go again GFP_USER Apr 25 23:16:24 crow kernel: go again GFP_NOWAIT With this setup, I also forced each of the bpf_iter_udp_realloc_batch calls to return -ENOMEM to ensure that iteration ends and that the read() in userspace fails. [1]: https://lore.kernel.org/bpf/CABi4-ogUtMrH8-NVB6W8Xg_F_KDLq=yy-yu-tKr2udXE2Mu1Lg@mail.gmail.com/ [2]: https://lore.kernel.org/bpf/7ed28273-a716-4638-912d-f86f965e54bb@linux.dev/ Signed-off-by: Jordan Rife Signed-off-by: Martin KaFai Lau --- include/linux/udp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 895240177f4f..4e1a672af4c5 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -216,6 +216,9 @@ static inline void udp_allow_gso(struct sock *sk) #define udp_portaddr_for_each_entry(__sk, list) \ hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node) +#define udp_portaddr_for_each_entry_from(__sk) \ + hlist_for_each_entry_from(__sk, __sk_common.skc_portaddr_node) + #define udp_portaddr_for_each_entry_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node) -- cgit v1.2.3 From 71ded61bee2af97a218f9dfa0f4019cc4cd3b029 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 3 Mar 2025 23:14:20 -0800 Subject: configfs-tsm: Namespace TSM report symbols In preparation for new + common TSM (TEE Security Manager) infrastructure, namespace the TSM report symbols in tsm.h with an _REPORT suffix to differentiate them from other incoming tsm work. Cc: Yilun Xu Cc: Samuel Ortiz Cc: Tom Lendacky Cc: Sami Mujawar Cc: Steven Price Reviewed-by: Alexey Kardashevskiy Reviewed-by: Suzuki K Poulose Reviewed-by: Kai Huang Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Steven Price Link: https://patch.msgid.link/174107246021.1288555.7203769833791489618.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dan Williams --- include/linux/tsm.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tsm.h b/include/linux/tsm.h index 11b0c525be30..431054810dca 100644 --- a/include/linux/tsm.h +++ b/include/linux/tsm.h @@ -6,17 +6,17 @@ #include #include -#define TSM_INBLOB_MAX 64 -#define TSM_OUTBLOB_MAX SZ_32K +#define TSM_REPORT_INBLOB_MAX 64 +#define TSM_REPORT_OUTBLOB_MAX SZ_32K /* * Privilege level is a nested permission concept to allow confidential * guests to partition address space, 4-levels are supported. */ -#define TSM_PRIVLEVEL_MAX 3 +#define TSM_REPORT_PRIVLEVEL_MAX 3 /** - * struct tsm_desc - option descriptor for generating tsm report blobs + * struct tsm_report_desc - option descriptor for generating tsm report blobs * @privlevel: optional privilege level to associate with @outblob * @inblob_len: sizeof @inblob * @inblob: arbitrary input data @@ -24,10 +24,10 @@ * @service_guid: optional service-provider service guid to attest * @service_manifest_version: optional service-provider service manifest version requested */ -struct tsm_desc { +struct tsm_report_desc { unsigned int privlevel; size_t inblob_len; - u8 inblob[TSM_INBLOB_MAX]; + u8 inblob[TSM_REPORT_INBLOB_MAX]; char *service_provider; guid_t service_guid; unsigned int service_manifest_version; @@ -44,7 +44,7 @@ struct tsm_desc { * @manifestblob: (optional) manifest data associated with the report */ struct tsm_report { - struct tsm_desc desc; + struct tsm_report_desc desc; size_t outblob_len; u8 *outblob; size_t auxblob_len; @@ -88,7 +88,7 @@ enum tsm_bin_attr_index { }; /** - * struct tsm_ops - attributes and operations for tsm instances + * struct tsm_report_ops - attributes and operations for tsm_report instances * @name: tsm id reflected in /sys/kernel/config/tsm/report/$report/provider * @privlevel_floor: convey base privlevel for nested scenarios * @report_new: Populate @report with the report blob and auxblob @@ -99,7 +99,7 @@ enum tsm_bin_attr_index { * Implementation specific ops, only one is expected to be registered at * a time i.e. only one of "sev-guest", "tdx-guest", etc. */ -struct tsm_ops { +struct tsm_report_ops { const char *name; unsigned int privlevel_floor; int (*report_new)(struct tsm_report *report, void *data); @@ -107,6 +107,6 @@ struct tsm_ops { bool (*report_bin_attr_visible)(int n); }; -int tsm_register(const struct tsm_ops *ops, void *priv); -int tsm_unregister(const struct tsm_ops *ops); +int tsm_report_register(const struct tsm_report_ops *ops, void *priv); +int tsm_report_unregister(const struct tsm_report_ops *ops); #endif /* __TSM_H */ -- cgit v1.2.3 From ca732e990fc8222a2d6782ae750304719e212fe8 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 1 May 2025 12:45:11 +0100 Subject: net: stmmac: add get_interfaces() platform method Add a get_interfaces() platform method to allow platforms to indicate to phylink which interface modes they support - which then allows phylink to validate on initialisation that the configured PHY interface mode is actually supported. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1uASLn-0021Qd-Mi@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 8aed09d65b4a..537bced69c46 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -233,6 +233,8 @@ struct plat_stmmacenet_data { u8 tx_sched_algorithm; struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES]; struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES]; + void (*get_interfaces)(struct stmmac_priv *priv, void *bsp_priv, + unsigned long *interfaces); int (*set_clk_tx_rate)(void *priv, struct clk *clk_tx_i, phy_interface_t interface, int speed); void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); -- cgit v1.2.3 From 9d165dc58055d98658941a33fef9e5da866af3e9 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 1 May 2025 12:45:27 +0100 Subject: net: stmmac: remove speed_mode_2500() method Remove the speed_mode_2500() platform method which is no longer used or necessary, being superseded by the more flexible get_interfaces() method. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1uASM3-0021R3-2B@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 537bced69c46..26ddf95d23f9 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -241,7 +241,6 @@ struct plat_stmmacenet_data { int (*fix_soc_reset)(void *priv, void __iomem *ioaddr); int (*serdes_powerup)(struct net_device *ndev, void *priv); void (*serdes_powerdown)(struct net_device *ndev, void *priv); - void (*speed_mode_2500)(struct net_device *ndev, void *priv); int (*mac_finish)(struct net_device *ndev, void *priv, unsigned int mode, -- cgit v1.2.3 From 3efa66ce6ee1b55ab687b316e48e1e9ddc1f780a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 16 Apr 2025 18:29:01 +0200 Subject: rcuref: Provide rcuref_is_dead() rcuref_read() returns the number of references that are currently held. If 0 is returned then it is not safe to assume that the object ca be scheduled for deconstruction because it is marked DEAD. This happens if the return value of rcuref_put() is ignored and assumptions are made. If 0 is returned then the counter transitioned from 0 to RCUREF_NOREF. If rcuref_put() did not return to the caller then the counter did not yet transition from RCUREF_NOREF to RCUREF_DEAD. This means that there is still a chance that the counter will transition from RCUREF_NOREF to 0 meaning it is still valid and must not be deconstructed. In this brief window rcuref_read() will return 0. Provide rcuref_is_dead() to determine if the counter is marked as RCUREF_DEAD. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250416162921.513656-2-bigeasy@linutronix.de --- include/linux/rcuref.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h index 6322d8c1c6b4..2fb2af6d9824 100644 --- a/include/linux/rcuref.h +++ b/include/linux/rcuref.h @@ -30,7 +30,11 @@ static inline void rcuref_init(rcuref_t *ref, unsigned int cnt) * rcuref_read - Read the number of held reference counts of a rcuref * @ref: Pointer to the reference count * - * Return: The number of held references (0 ... N) + * Return: The number of held references (0 ... N). The value 0 does not + * indicate that it is safe to schedule the object, protected by this reference + * counter, for deconstruction. + * If you want to know if the reference counter has been marked DEAD (as + * signaled by rcuref_put()) please use rcuread_is_dead(). */ static inline unsigned int rcuref_read(rcuref_t *ref) { @@ -40,6 +44,22 @@ static inline unsigned int rcuref_read(rcuref_t *ref) return c >= RCUREF_RELEASED ? 0 : c + 1; } +/** + * rcuref_is_dead - Check if the rcuref has been already marked dead + * @ref: Pointer to the reference count + * + * Return: True if the object has been marked DEAD. This signals that a previous + * invocation of rcuref_put() returned true on this reference counter meaning + * the protected object can safely be scheduled for deconstruction. + * Otherwise, returns false. + */ +static inline bool rcuref_is_dead(rcuref_t *ref) +{ + unsigned int c = atomic_read(&ref->refcnt); + + return (c >= RCUREF_RELEASED) && (c < RCUREF_NOREF); +} + extern __must_check bool rcuref_get_slowpath(rcuref_t *ref); /** -- cgit v1.2.3 From 55284f70134f01fdc9cc4c4905551cc1f37abd34 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Apr 2025 18:29:02 +0200 Subject: mm: Add vmalloc_huge_node() To enable node specific hash-tables using huge pages if possible. [bigeasy: use __vmalloc_node_range_noprof(), add nommu bits, inline vmalloc_huge] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250416162921.513656-3-bigeasy@linutronix.de --- include/linux/vmalloc.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 31e9ffd936e3..de95794777ad 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -168,8 +168,13 @@ void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_m int node, const void *caller) __alloc_size(1); #define __vmalloc_node(...) alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__)) -void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); -#define vmalloc_huge(...) alloc_hooks(vmalloc_huge_noprof(__VA_ARGS__)) +void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) __alloc_size(1); +#define vmalloc_huge_node(...) alloc_hooks(vmalloc_huge_node_noprof(__VA_ARGS__)) + +static inline void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) +{ + return vmalloc_huge_node(size, gfp_mask, NUMA_NO_NODE); +} extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); #define __vmalloc_array(...) alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__)) -- cgit v1.2.3 From 80367ad01d93ac781b0e1df246edaf006928002f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 16 Apr 2025 18:29:12 +0200 Subject: futex: Add basic infrastructure for local task local hash The futex hash is system wide and shared by all tasks. Each slot is hashed based on futex address and the VMA of the thread. Due to randomized VMAs (and memory allocations) the same logical lock (pointer) can end up in a different hash bucket on each invocation of the application. This in turn means that different applications may share a hash bucket on the first invocation but not on the second and it is not always clear which applications will be involved. This can result in high latency's to acquire the futex_hash_bucket::lock especially if the lock owner is limited to a CPU and can not be effectively PI boosted. Introduce basic infrastructure for process local hash which is shared by all threads of process. This hash will only be used for a PROCESS_PRIVATE FUTEX operation. The hashmap can be allocated via: prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, num); A `num' of 0 means that the global hash is used instead of a private hash. Other values for `num' specify the number of slots for the hash and the number must be power of two, starting with two. The prctl() returns zero on success. This function can only be used before a thread is created. The current status for the private hash can be queried via: num = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS); which return the current number of slots. The value 0 means that the global hash is used. Values greater than 0 indicate the number of slots that are used. A negative number indicates an error. For optimisation, for the private hash jhash2() uses only two arguments the address and the offset. This omits the VMA which is always the same. [peterz: Use 0 for global hash. A bit shuffling and renaming. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250416162921.513656-13-bigeasy@linutronix.de --- include/linux/futex.h | 26 ++++++++++++++++++++++++-- include/linux/mm_types.h | 5 ++++- 2 files changed, 28 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/futex.h b/include/linux/futex.h index b70df27d7e85..8f1be08bef18 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -4,11 +4,11 @@ #include #include +#include #include struct inode; -struct mm_struct; struct task_struct; /* @@ -77,7 +77,22 @@ void futex_exec_release(struct task_struct *tsk); long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3); -#else +int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4); + +#ifdef CONFIG_FUTEX_PRIVATE_HASH +void futex_hash_free(struct mm_struct *mm); + +static inline void futex_mm_init(struct mm_struct *mm) +{ + mm->futex_phash = NULL; +} + +#else /* !CONFIG_FUTEX_PRIVATE_HASH */ +static inline void futex_hash_free(struct mm_struct *mm) { } +static inline void futex_mm_init(struct mm_struct *mm) { } +#endif /* CONFIG_FUTEX_PRIVATE_HASH */ + +#else /* !CONFIG_FUTEX */ static inline void futex_init_task(struct task_struct *tsk) { } static inline void futex_exit_recursive(struct task_struct *tsk) { } static inline void futex_exit_release(struct task_struct *tsk) { } @@ -88,6 +103,13 @@ static inline long do_futex(u32 __user *uaddr, int op, u32 val, { return -EINVAL; } +static inline int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) +{ + return -EINVAL; +} +static inline void futex_hash_free(struct mm_struct *mm) { } +static inline void futex_mm_init(struct mm_struct *mm) { } + #endif #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 56d07edd01f9..a4b5661e4177 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -31,6 +31,7 @@ #define INIT_PASID 0 struct address_space; +struct futex_private_hash; struct mem_cgroup; /* @@ -1031,7 +1032,9 @@ struct mm_struct { */ seqcount_t mm_lock_seq; #endif - +#ifdef CONFIG_FUTEX_PRIVATE_HASH + struct futex_private_hash *futex_phash; +#endif unsigned long hiwater_rss; /* High-watermark of RSS usage */ unsigned long hiwater_vm; /* High-water virtual memory usage */ -- cgit v1.2.3 From 7c4f75a21f636486d2969d9b6680403ea8483539 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 16 Apr 2025 18:29:13 +0200 Subject: futex: Allow automatic allocation of process wide futex hash Allocate a private futex hash with 16 slots if a task forks its first thread. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250416162921.513656-14-bigeasy@linutronix.de --- include/linux/futex.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/futex.h b/include/linux/futex.h index 8f1be08bef18..1d3f7555825e 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -80,6 +80,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4); #ifdef CONFIG_FUTEX_PRIVATE_HASH +int futex_hash_allocate_default(void); void futex_hash_free(struct mm_struct *mm); static inline void futex_mm_init(struct mm_struct *mm) @@ -88,6 +89,7 @@ static inline void futex_mm_init(struct mm_struct *mm) } #else /* !CONFIG_FUTEX_PRIVATE_HASH */ +static inline int futex_hash_allocate_default(void) { return 0; } static inline void futex_hash_free(struct mm_struct *mm) { } static inline void futex_mm_init(struct mm_struct *mm) { } #endif /* CONFIG_FUTEX_PRIVATE_HASH */ @@ -107,6 +109,10 @@ static inline int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsig { return -EINVAL; } +static inline int futex_hash_allocate_default(void) +{ + return 0; +} static inline void futex_hash_free(struct mm_struct *mm) { } static inline void futex_mm_init(struct mm_struct *mm) { } -- cgit v1.2.3 From bd54df5ea7cadac520e346d5f0fe5d58e635b6ba Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 16 Apr 2025 18:29:14 +0200 Subject: futex: Allow to resize the private local hash The mm_struct::futex_hash_lock guards the futex_hash_bucket assignment/ replacement. The futex_hash_allocate()/ PR_FUTEX_HASH_SET_SLOTS operation can now be invoked at runtime and resize an already existing internal private futex_hash_bucket to another size. The reallocation is based on an idea by Thomas Gleixner: The initial allocation of struct futex_private_hash sets the reference count to one. Every user acquires a reference on the local hash before using it and drops it after it enqueued itself on the hash bucket. There is no reference held while the task is scheduled out while waiting for the wake up. The resize process allocates a new struct futex_private_hash and drops the initial reference. Synchronized with mm_struct::futex_hash_lock it is checked if the reference counter for the currently used mm_struct::futex_phash is marked as DEAD. If so, then all users enqueued on the current private hash are requeued on the new private hash and the new private hash is set to mm_struct::futex_phash. Otherwise the newly allocated private hash is saved as mm_struct::futex_phash_new and the rehashing and reassigning is delayed to the futex_hash() caller once the reference counter is marked DEAD. The replacement is not performed at rcuref_put() time because certain callers, such as futex_wait_queue(), drop their reference after changing the task state. This change will be destroyed once the futex_hash_lock is acquired. The user can change the number slots with PR_FUTEX_HASH_SET_SLOTS multiple times. An increase and decrease is allowed and request blocks until the assignment is done. The private hash allocated at thread creation is changed from 16 to 16 <= 4 * number_of_threads <= global_hash_size where number_of_threads can not exceed the number of online CPUs. Should the user PR_FUTEX_HASH_SET_SLOTS then the auto scaling is disabled. [peterz: reorganize the code to avoid state tracking and simplify new object handling, block the user until changes are in effect, allow increase and decrease of the hash]. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250416162921.513656-15-bigeasy@linutronix.de --- include/linux/futex.h | 3 ++- include/linux/mm_types.h | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/futex.h b/include/linux/futex.h index 1d3f7555825e..40bc778b2bb4 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -85,7 +85,8 @@ void futex_hash_free(struct mm_struct *mm); static inline void futex_mm_init(struct mm_struct *mm) { - mm->futex_phash = NULL; + rcu_assign_pointer(mm->futex_phash, NULL); + mutex_init(&mm->futex_hash_lock); } #else /* !CONFIG_FUTEX_PRIVATE_HASH */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a4b5661e4177..32ba5126e221 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1033,7 +1033,9 @@ struct mm_struct { seqcount_t mm_lock_seq; #endif #ifdef CONFIG_FUTEX_PRIVATE_HASH - struct futex_private_hash *futex_phash; + struct mutex futex_hash_lock; + struct futex_private_hash __rcu *futex_phash; + struct futex_private_hash *futex_phash_new; #endif unsigned long hiwater_rss; /* High-watermark of RSS usage */ -- cgit v1.2.3 From cec199c5e39bde7191a08087cc3d002ccfab31ff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Apr 2025 18:29:16 +0200 Subject: futex: Implement FUTEX2_NUMA Extend the futex2 interface to be numa aware. When FUTEX2_NUMA is specified for a futex, the user value is extended to two words (of the same size). The first is the user value we all know, the second one will be the node to place this futex on. struct futex_numa_32 { u32 val; u32 node; }; When node is set to ~0, WAIT will set it to the current node_id such that WAKE knows where to find it. If userspace corrupts the node value between WAIT and WAKE, the futex will not be found and no wakeup will happen. When FUTEX2_NUMA is not set, the node is simply an extension of the hash, such that traditional futexes are still interleaved over the nodes. This is done to avoid having to have a separate !numa hash-table. [bigeasy: ensure to have at least hashsize of 4 in futex_init(), add pr_info() for size and allocation information. Cast the naddr math to void*] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250416162921.513656-17-bigeasy@linutronix.de --- include/linux/futex.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/futex.h b/include/linux/futex.h index 40bc778b2bb4..eccc99751bd9 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -34,6 +34,7 @@ union futex_key { u64 i_seq; unsigned long pgoff; unsigned int offset; + /* unsigned int node; */ } shared; struct { union { @@ -42,11 +43,13 @@ union futex_key { }; unsigned long address; unsigned int offset; + /* unsigned int node; */ } private; struct { u64 ptr; unsigned long word; unsigned int offset; + unsigned int node; /* NOT hashed! */ } both; }; -- cgit v1.2.3 From c042c505210dc3453f378df432c10fff3d471bc5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Apr 2025 18:29:17 +0200 Subject: futex: Implement FUTEX2_MPOL Extend the futex2 interface to be aware of mempolicy. When FUTEX2_MPOL is specified and there is a MPOL_PREFERRED or home_node specified covering the futex address, use that hash-map. Notably, in this case the futex will go to the global node hashtable, even if it is a PRIVATE futex. When FUTEX2_NUMA|FUTEX2_MPOL is specified and the user specified node value is FUTEX_NO_NODE, the MPOL lookup (as described above) will be tried first before reverting to setting node to the local node. [bigeasy: add CONFIG_FUTEX_MPOL, add MPOL to FUTEX2_VALID_MASK, write the node only to user if FUTEX_NO_NODE was supplied] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250416162921.513656-18-bigeasy@linutronix.de --- include/linux/mmap_lock.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 4706c6769902..e0eddfd306ef 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -7,6 +7,7 @@ #include #include #include +#include #define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), @@ -211,6 +212,9 @@ static inline void mmap_read_unlock(struct mm_struct *mm) up_read(&mm->mmap_lock); } +DEFINE_GUARD(mmap_read_lock, struct mm_struct *, + mmap_read_lock(_T), mmap_read_unlock(_T)) + static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { __mmap_lock_trace_released(mm, false); -- cgit v1.2.3 From 4862c8861d902d43645a493e441c4478be1c6c44 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 29 Apr 2025 18:50:17 +0200 Subject: dm: Allow .prepare_ioctl to handle ioctls directly This adds a 'bool *forward' parameter to .prepare_ioctl, which allows device mapper targets to accept ioctls to themselves instead of the underlying device. If the target already fully handled the ioctl, it sets *forward to false and device mapper won't forward it to the underlying device any more. In order for targets to actually know what the ioctl is about and how to handle it, pass also cmd and arg. As long as targets restrict themselves to interpreting ioctls of type DM_IOCTL, this is a backwards compatible change because previously, any such ioctl would have been passed down through all device mapper layers until it reached a device that can't understand the ioctl and would return an error. Signed-off-by: Kevin Wolf Reviewed-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka --- include/linux/device-mapper.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index bcc6d7b69470..cb95951547ab 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -93,7 +93,14 @@ typedef void (*dm_status_fn) (struct dm_target *ti, status_type_t status_type, typedef int (*dm_message_fn) (struct dm_target *ti, unsigned int argc, char **argv, char *result, unsigned int maxlen); -typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **bdev); +/* + * Called with *forward == true. If it remains true, the ioctl should be + * forwarded to bdev. If it is reset to false, the target already fully handled + * the ioctl and the return value is the return value for the whole ioctl. + */ +typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward); #ifdef CONFIG_BLK_DEV_ZONED typedef int (*dm_report_zones_fn) (struct dm_target *ti, -- cgit v1.2.3 From 8fd17374be8f220c26bec2b482cabf51ebbaed80 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 1 May 2025 20:37:32 +0800 Subject: crypto: api - Rename CRYPTO_ALG_REQ_CHAIN to CRYPTO_ALG_REQ_VIRT As chaining has been removed, all that remains of REQ_CHAIN is just virtual address support. Rename it before the reintroduction of batching creates confusion. Signed-off-by: Herbert Xu --- include/linux/crypto.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index b8d875b11193..b50f1954d1bb 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -133,8 +133,8 @@ */ #define CRYPTO_ALG_FIPS_INTERNAL 0x00020000 -/* Set if the algorithm supports request chains and virtual addresses. */ -#define CRYPTO_ALG_REQ_CHAIN 0x00040000 +/* Set if the algorithm supports virtual addresses. */ +#define CRYPTO_ALG_REQ_VIRT 0x00040000 /* The high bits 0xff000000 are reserved for type-specific flags. */ -- cgit v1.2.3 From 6b3754009f871083b114daacbad5b87a494da4b8 Mon Sep 17 00:00:00 2001 From: Patrice Chotard Date: Fri, 11 Apr 2025 14:41:10 +0200 Subject: reset: Add devm_reset_control_array_get_exclusive_released() Add the released variant of devm_reset_control_array_get_exclusive(). Needed by spi-smt32-ospi driver as same reset line is ulso used by stm32-omm driver. Signed-off-by: Patrice Chotard Reviewed-by: Philipp Zabel Link: https://lore.kernel.org/r/20250411-b4-upstream_ospi_reset_update-v2-1-4de7f5dd2a91@foss.st.com Signed-off-by: Philipp Zabel --- include/linux/reset.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reset.h b/include/linux/reset.h index 2986ced69a02..840d75d172f6 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -1004,6 +1004,12 @@ devm_reset_control_array_get_exclusive(struct device *dev) return devm_reset_control_array_get(dev, RESET_CONTROL_EXCLUSIVE); } +static inline struct reset_control * +devm_reset_control_array_get_exclusive_released(struct device *dev) +{ + return devm_reset_control_array_get(dev, RESET_CONTROL_EXCLUSIVE_RELEASED); +} + static inline struct reset_control * devm_reset_control_array_get_shared(struct device *dev) { -- cgit v1.2.3 From 5d2ea5aebbb2f3ebde4403f9c55b2b057e5dd2d6 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Mon, 28 Apr 2025 14:34:07 +0300 Subject: RDMA/mlx5: Fix error flow upon firmware failure for RQ destruction Upon RQ destruction if the firmware command fails which is the last resource to be destroyed some SW resources were already cleaned regardless of the failure. Now properly rollback the object to its original state upon such failure. In order to avoid a use-after free in case someone tries to destroy the object again, which results in the following kernel trace: refcount_t: underflow; use-after-free. WARNING: CPU: 0 PID: 37589 at lib/refcount.c:28 refcount_warn_saturate+0xf4/0x148 Modules linked in: rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_ib(OE) rfkill mlx5_core(OE) mlxdevm(OE) ib_uverbs(OE) ib_core(OE) psample mlxfw(OE) mlx_compat(OE) macsec tls pci_hyperv_intf sunrpc vfat fat virtio_net net_failover failover fuse loop nfnetlink vsock_loopback vmw_vsock_virtio_transport_common vmw_vsock_vmci_transport vmw_vmci vsock xfs crct10dif_ce ghash_ce sha2_ce sha256_arm64 sha1_ce virtio_console virtio_gpu virtio_blk virtio_dma_buf virtio_mmio dm_mirror dm_region_hash dm_log dm_mod xpmem(OE) CPU: 0 UID: 0 PID: 37589 Comm: python3 Kdump: loaded Tainted: G OE ------- --- 6.12.0-54.el10.aarch64 #1 Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : refcount_warn_saturate+0xf4/0x148 lr : refcount_warn_saturate+0xf4/0x148 sp : ffff80008b81b7e0 x29: ffff80008b81b7e0 x28: ffff000133d51600 x27: 0000000000000001 x26: 0000000000000000 x25: 00000000ffffffea x24: ffff00010ae80f00 x23: ffff00010ae80f80 x22: ffff0000c66e5d08 x21: 0000000000000000 x20: ffff0000c66e0000 x19: ffff00010ae80340 x18: 0000000000000006 x17: 0000000000000000 x16: 0000000000000020 x15: ffff80008b81b37f x14: 0000000000000000 x13: 2e656572662d7265 x12: ffff80008283ef78 x11: ffff80008257efd0 x10: ffff80008283efd0 x9 : ffff80008021ed90 x8 : 0000000000000001 x7 : 00000000000bffe8 x6 : c0000000ffff7fff x5 : ffff0001fb8e3408 x4 : 0000000000000000 x3 : ffff800179993000 x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff000133d51600 Call trace: refcount_warn_saturate+0xf4/0x148 mlx5_core_put_rsc+0x88/0xa0 [mlx5_ib] mlx5_core_destroy_rq_tracked+0x64/0x98 [mlx5_ib] mlx5_ib_destroy_wq+0x34/0x80 [mlx5_ib] ib_destroy_wq_user+0x30/0xc0 [ib_core] uverbs_free_wq+0x28/0x58 [ib_uverbs] destroy_hw_idr_uobject+0x34/0x78 [ib_uverbs] uverbs_destroy_uobject+0x48/0x240 [ib_uverbs] __uverbs_cleanup_ufile+0xd4/0x1a8 [ib_uverbs] uverbs_destroy_ufile_hw+0x48/0x120 [ib_uverbs] ib_uverbs_close+0x2c/0x100 [ib_uverbs] __fput+0xd8/0x2f0 __fput_sync+0x50/0x70 __arm64_sys_close+0x40/0x90 invoke_syscall.constprop.0+0x74/0xd0 do_el0_svc+0x48/0xe8 el0_svc+0x44/0x1d0 el0t_64_sync_handler+0x120/0x130 el0t_64_sync+0x1a4/0x1a8 Fixes: e2013b212f9f ("net/mlx5_core: Add RQ and SQ event handling") Signed-off-by: Patrisious Haddad Link: https://patch.msgid.link/3181433ccdd695c63560eeeb3f0c990961732101.1745839855.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d1dfbad9a447..e6ba8f4f4bd1 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -398,6 +398,7 @@ struct mlx5_core_rsc_common { enum mlx5_res_type res; refcount_t refcount; struct completion free; + bool invalid; }; struct mlx5_uars_page { -- cgit v1.2.3 From 575ec9b0c2f11f40535ea737ed5a64792780d1ef Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Fri, 25 Apr 2025 19:36:50 +0530 Subject: dma-fence: Add helper to sort and deduplicate dma_fence arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export a new helper function `dma_fence_dedup_array()` that sorts an array of dma_fence pointers by context, then deduplicates the array by retaining only the most recent fence per context. This utility is useful when merging or optimizing sets of fences where redundant entries from the same context can be pruned. The operation is performed in-place and releases references to dropped fences using dma_fence_put(). v2: - Export this code from dma-fence-unwrap.c(by Christian). v3: - To split this in a dma_buf patch and amd userq patch(by Sunil). - No need to add a new function just re-use existing(by Christian). v4: - Export dma_fence_dedub_array and use it(by Christian). Cc: Alex Deucher Cc: Arunpravin Paneer Selvam Reviewed-by: Sunil Khatri Reviewed-by: Christian König Signed-off-by: Arvind Yadav Signed-off-by: Alex Deucher --- include/linux/dma-fence-unwrap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-fence-unwrap.h b/include/linux/dma-fence-unwrap.h index 66b1e56fbb81..62df222fe0f1 100644 --- a/include/linux/dma-fence-unwrap.h +++ b/include/linux/dma-fence-unwrap.h @@ -52,6 +52,8 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences, struct dma_fence **fences, struct dma_fence_unwrap *cursors); +int dma_fence_dedup_array(struct dma_fence **array, int num_fences); + /** * dma_fence_unwrap_merge - unwrap and merge fences * -- cgit v1.2.3 From eeadd68e2a5f6bfe0bf1038ec49e3a8d99eb5fe8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 May 2025 10:11:25 +0200 Subject: block: remove bounce buffering support The block layer bounce buffering support is unused now, remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: John Garry Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250505081138.3435992-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 - include/linux/blkdev.h | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index dce7615c35e7..5a46067e85b1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -286,7 +286,6 @@ struct bio { enum { BIO_PAGE_PINNED, /* Unpin pages in bio_release_pages() */ BIO_CLONED, /* doesn't own data */ - BIO_BOUNCED, /* bio is a bounce bio */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ BIO_REFFED, /* bio has elevated ->bi_cnt */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f3d74f9dae8e..5ccb961ee2ae 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -331,9 +331,6 @@ typedef unsigned int __bitwise blk_features_t; /* skip this queue in blk_mq_(un)quiesce_tagset */ #define BLK_FEAT_SKIP_TAGSET_QUIESCE ((__force blk_features_t)(1u << 13)) -/* bounce all highmem pages */ -#define BLK_FEAT_BOUNCE_HIGH ((__force blk_features_t)(1u << 14)) - /* undocumented magic for bcache */ #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ ((__force blk_features_t)(1u << 15)) @@ -347,7 +344,7 @@ typedef unsigned int __bitwise blk_features_t; */ #define BLK_FEAT_INHERIT_MASK \ (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \ - BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | BLK_FEAT_BOUNCE_HIGH | \ + BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | \ BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE) /* internal flags in queue_limits.flags */ -- cgit v1.2.3 From 194df9f66db8d6f74f03c78c2ad47b74a5a8b886 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 May 2025 10:11:26 +0200 Subject: mm: remove NR_BOUNCE zone stat The stat is always 0 now, so remove it and hardwire the user visible output to 0. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250505081138.3435992-8-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/mmzone.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6ccec1bf2896..b1c459f7a485 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -148,7 +148,6 @@ enum zone_stat_item { NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ /* Second 128 byte cacheline */ - NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif -- cgit v1.2.3 From f4fcfdda2fd8834c62dcb9bfddcf1f89d190b70e Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 23 Apr 2025 14:42:13 -0500 Subject: of: reserved_mem: Add functions to parse "memory-region" Drivers with "memory-region" properties currently have to do their own parsing of "memory-region" properties. The result is all the drivers have similar patterns of a call to parse "memory-region" and then get the region's address and size. As this is a standard property, it should have common functions for drivers to use. Add new functions to count the number of regions and retrieve the region's address as a resource. Reviewed-by: Daniel Baluta Acked-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20250423-dt-memory-region-v2-v2-1-2fbd6ebd3c88@kernel.org Signed-off-by: Rob Herring (Arm) --- include/linux/of_reserved_mem.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h index e338282da652..f573423359f4 100644 --- a/include/linux/of_reserved_mem.h +++ b/include/linux/of_reserved_mem.h @@ -7,6 +7,7 @@ struct of_phandle_args; struct reserved_mem_ops; +struct resource; struct reserved_mem { const char *name; @@ -39,6 +40,12 @@ int of_reserved_mem_device_init_by_name(struct device *dev, void of_reserved_mem_device_release(struct device *dev); struct reserved_mem *of_reserved_mem_lookup(struct device_node *np); +int of_reserved_mem_region_to_resource(const struct device_node *np, + unsigned int idx, struct resource *res); +int of_reserved_mem_region_to_resource_byname(const struct device_node *np, + const char *name, struct resource *res); +int of_reserved_mem_region_count(const struct device_node *np); + #else #define RESERVEDMEM_OF_DECLARE(name, compat, init) \ @@ -63,6 +70,25 @@ static inline struct reserved_mem *of_reserved_mem_lookup(struct device_node *np { return NULL; } + +static inline int of_reserved_mem_region_to_resource(const struct device_node *np, + unsigned int idx, + struct resource *res) +{ + return -ENOSYS; +} + +static inline int of_reserved_mem_region_to_resource_byname(const struct device_node *np, + const char *name, + struct resource *res) +{ + return -ENOSYS; +} + +static inline int of_reserved_mem_region_count(const struct device_node *np) +{ + return 0; +} #endif /** -- cgit v1.2.3 From 23227e71b69af95e421e263302d13f426c548155 Mon Sep 17 00:00:00 2001 From: Guan-Chun Wu <409411716@gms.tku.edu.tw> Date: Sat, 3 May 2025 16:12:03 +0800 Subject: workqueue: fix typo in comment Fix a duplicated word "that that" in the comment describing the @max_active behavior for unbound workqueues. Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw> Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index b0dc957c3e56..a6dacc0eb688 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -480,7 +480,7 @@ void workqueue_softirq_dead(unsigned int cpu); * executing at most one work item for the workqueue. * * For unbound workqueues, @max_active limits the number of in-flight work items - * for the whole system. e.g. @max_active of 16 indicates that that there can be + * for the whole system. e.g. @max_active of 16 indicates that there can be * at most 16 work items executing for the workqueue in the whole system. * * As sharing the same active counter for an unbound workqueue across multiple -- cgit v1.2.3 From a25e7962db0d79882c9d32fd2a67a02e79129c0e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 May 2025 10:01:38 +0300 Subject: PCI/P2PDMA: Refactor the p2pdma mapping helpers The current scheme with a single helper to determine the P2P status and map a scatterlist segment force users to always use the map_sg helper to DMA map, which we're trying to get away from because they are very cache inefficient. Refactor the code so that there is a single helper that checks the P2P state for a page, including the result that it is not a P2P page to simplify the callers, and a second one to perform the address translation for a bus mapped P2P transfer that does not depend on the scatterlist structure. Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Acked-by: Bjorn Helgaas Tested-by: Jens Axboe Reviewed-by: Luis Chamberlain Reviewed-by: Lu Baolu Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski --- include/linux/dma-map-ops.h | 51 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index e172522cd936..c3086edeccc6 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -443,6 +443,11 @@ enum pci_p2pdma_map_type { */ PCI_P2PDMA_MAP_UNKNOWN = 0, + /* + * Not a PCI P2PDMA transfer. + */ + PCI_P2PDMA_MAP_NONE, + /* * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will * traverse the host bridge and the host bridge is not in the @@ -471,21 +476,47 @@ enum pci_p2pdma_map_type { struct pci_p2pdma_map_state { struct dev_pagemap *pgmap; - int map; + enum pci_p2pdma_map_type map; u64 bus_off; }; -#ifdef CONFIG_PCI_P2PDMA -enum pci_p2pdma_map_type -pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state, struct device *dev, - struct scatterlist *sg); -#else /* CONFIG_PCI_P2PDMA */ +/* helper for pci_p2pdma_state(), do not use directly */ +void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, + struct device *dev, struct page *page); + +/** + * pci_p2pdma_state - check the P2P transfer state of a page + * @state: P2P state structure + * @dev: device to transfer to/from + * @page: page to map + * + * Check if @page is a PCI P2PDMA page, and if yes of what kind. Returns the + * map type, and updates @state with all information needed for a P2P transfer. + */ static inline enum pci_p2pdma_map_type -pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state, struct device *dev, - struct scatterlist *sg) +pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev, + struct page *page) +{ + if (IS_ENABLED(CONFIG_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { + if (state->pgmap != page_pgmap(page)) + __pci_p2pdma_update_state(state, dev, page); + return state->map; + } + return PCI_P2PDMA_MAP_NONE; +} + +/** + * pci_p2pdma_bus_addr_map - map a PCI_P2PDMA_MAP_BUS_ADDR P2P transfer + * @state: P2P state structure + * @paddr: physical address to map + * + * Map a physically contiguous PCI_P2PDMA_MAP_BUS_ADDR transfer. + */ +static inline dma_addr_t +pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr) { - return PCI_P2PDMA_MAP_NOT_SUPPORTED; + WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR); + return paddr + state->bus_off; } -#endif /* CONFIG_PCI_P2PDMA */ #endif /* _LINUX_DMA_MAP_OPS_H */ -- cgit v1.2.3 From ca2c2e4a78c619bcf1c1df29fee5981035f3fcbc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 May 2025 10:01:39 +0300 Subject: dma-mapping: move the PCI P2PDMA mapping helpers to pci-p2pdma.h To support the upcoming non-scatterlist mapping helpers, we need to go back to have them called outside of the DMA API. Thus move them out of dma-map-ops.h, which is only for DMA API implementations to pci-p2pdma.h, which is for driver use. Note that the core helper is still not exported as the mapping is expected to be done only by very highlevel subsystem code at least for now. Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Acked-by: Bjorn Helgaas Tested-by: Jens Axboe Reviewed-by: Luis Chamberlain Reviewed-by: Lu Baolu Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski --- include/linux/dma-map-ops.h | 85 --------------------------------------------- include/linux/pci-p2pdma.h | 85 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 85 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index c3086edeccc6..f48e5fb88bd5 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -434,89 +434,4 @@ static inline void debug_dma_dump_mappings(struct device *dev) #endif /* CONFIG_DMA_API_DEBUG */ extern const struct dma_map_ops dma_dummy_ops; - -enum pci_p2pdma_map_type { - /* - * PCI_P2PDMA_MAP_UNKNOWN: Used internally for indicating the mapping - * type hasn't been calculated yet. Functions that return this enum - * never return this value. - */ - PCI_P2PDMA_MAP_UNKNOWN = 0, - - /* - * Not a PCI P2PDMA transfer. - */ - PCI_P2PDMA_MAP_NONE, - - /* - * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will - * traverse the host bridge and the host bridge is not in the - * allowlist. DMA Mapping routines should return an error when - * this is returned. - */ - PCI_P2PDMA_MAP_NOT_SUPPORTED, - - /* - * PCI_P2PDMA_BUS_ADDR: Indicates that two devices can talk to - * each other directly through a PCI switch and the transaction will - * not traverse the host bridge. Such a mapping should program - * the DMA engine with PCI bus addresses. - */ - PCI_P2PDMA_MAP_BUS_ADDR, - - /* - * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk - * to each other, but the transaction traverses a host bridge on the - * allowlist. In this case, a normal mapping either with CPU physical - * addresses (in the case of dma-direct) or IOVA addresses (in the - * case of IOMMUs) should be used to program the DMA engine. - */ - PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, -}; - -struct pci_p2pdma_map_state { - struct dev_pagemap *pgmap; - enum pci_p2pdma_map_type map; - u64 bus_off; -}; - -/* helper for pci_p2pdma_state(), do not use directly */ -void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, - struct device *dev, struct page *page); - -/** - * pci_p2pdma_state - check the P2P transfer state of a page - * @state: P2P state structure - * @dev: device to transfer to/from - * @page: page to map - * - * Check if @page is a PCI P2PDMA page, and if yes of what kind. Returns the - * map type, and updates @state with all information needed for a P2P transfer. - */ -static inline enum pci_p2pdma_map_type -pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev, - struct page *page) -{ - if (IS_ENABLED(CONFIG_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { - if (state->pgmap != page_pgmap(page)) - __pci_p2pdma_update_state(state, dev, page); - return state->map; - } - return PCI_P2PDMA_MAP_NONE; -} - -/** - * pci_p2pdma_bus_addr_map - map a PCI_P2PDMA_MAP_BUS_ADDR P2P transfer - * @state: P2P state structure - * @paddr: physical address to map - * - * Map a physically contiguous PCI_P2PDMA_MAP_BUS_ADDR transfer. - */ -static inline dma_addr_t -pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr) -{ - WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR); - return paddr + state->bus_off; -} - #endif /* _LINUX_DMA_MAP_OPS_H */ diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 2c07aa6b7665..075c20b161d9 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -104,4 +104,89 @@ static inline struct pci_dev *pci_p2pmem_find(struct device *client) return pci_p2pmem_find_many(&client, 1); } +enum pci_p2pdma_map_type { + /* + * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before + * the mapping type has been calculated. Exported routines for the API + * will never return this value. + */ + PCI_P2PDMA_MAP_UNKNOWN = 0, + + /* + * Not a PCI P2PDMA transfer. + */ + PCI_P2PDMA_MAP_NONE, + + /* + * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will + * traverse the host bridge and the host bridge is not in the + * allowlist. DMA Mapping routines should return an error when + * this is returned. + */ + PCI_P2PDMA_MAP_NOT_SUPPORTED, + + /* + * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to + * each other directly through a PCI switch and the transaction will + * not traverse the host bridge. Such a mapping should program + * the DMA engine with PCI bus addresses. + */ + PCI_P2PDMA_MAP_BUS_ADDR, + + /* + * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk + * to each other, but the transaction traverses a host bridge on the + * allowlist. In this case, a normal mapping either with CPU physical + * addresses (in the case of dma-direct) or IOVA addresses (in the + * case of IOMMUs) should be used to program the DMA engine. + */ + PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, +}; + +struct pci_p2pdma_map_state { + struct dev_pagemap *pgmap; + enum pci_p2pdma_map_type map; + u64 bus_off; +}; + +/* helper for pci_p2pdma_state(), do not use directly */ +void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, + struct device *dev, struct page *page); + +/** + * pci_p2pdma_state - check the P2P transfer state of a page + * @state: P2P state structure + * @dev: device to transfer to/from + * @page: page to map + * + * Check if @page is a PCI P2PDMA page, and if yes of what kind. Returns the + * map type, and updates @state with all information needed for a P2P transfer. + */ +static inline enum pci_p2pdma_map_type +pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev, + struct page *page) +{ + if (IS_ENABLED(CONFIG_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { + if (state->pgmap != page_pgmap(page)) + __pci_p2pdma_update_state(state, dev, page); + return state->map; + } + return PCI_P2PDMA_MAP_NONE; +} + +/** + * pci_p2pdma_bus_addr_map - Translate a physical address to a bus address + * for a PCI_P2PDMA_MAP_BUS_ADDR transfer. + * @state: P2P state structure + * @paddr: physical address to map + * + * Map a physically contiguous PCI_P2PDMA_MAP_BUS_ADDR transfer. + */ +static inline dma_addr_t +pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr) +{ + WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR); + return paddr + state->bus_off; +} + #endif /* _LINUX_PCI_P2P_H */ -- cgit v1.2.3 From 5c87cffe2d3853cfae61e9373ee98a0409839178 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 May 2025 10:01:40 +0300 Subject: iommu: generalize the batched sync after map interface For the upcoming IOVA-based DMA API we want to batch the ops->iotlb_sync_map() call after mapping multiple IOVAs from dma-iommu without having a scatterlist. Improve the API. Add a wrapper for the map_sync as iommu_sync_map() so that callers don't need to poke into the methods directly. Formalize __iommu_map() into iommu_map_nosync() which requires the caller to call iommu_sync_map() after all maps are completed. Refactor the existing sanity checks from all the different layers into iommu_map_nosync(). Signed-off-by: Christoph Hellwig Acked-by: Will Deacon Tested-by: Jens Axboe Reviewed-by: Jason Gunthorpe Reviewed-by: Luis Chamberlain Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski --- include/linux/iommu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ccce8a751e2a..ce472af8e9c3 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -872,6 +872,10 @@ extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); +int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, gfp_t gfp); +int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, + size_t size); extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size); extern size_t iommu_unmap_fast(struct iommu_domain *domain, -- cgit v1.2.3 From 393cf700e6242032fbad51b248111ab6740ce8ad Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 5 May 2025 10:01:42 +0300 Subject: dma-mapping: Provide an interface to allow allocate IOVA The existing .map_pages() callback provides both allocating of IOVA and linking DMA pages. That combination works great for most of the callers who use it in control paths, but is less effective in fast paths where there may be multiple calls to map_page(). These advanced callers already manage their data in some sort of database and can perform IOVA allocation in advance, leaving range linkage operation to be in fast path. Provide an interface to allocate/deallocate IOVA and next patch link/unlink DMA ranges to that specific IOVA. In the new API a DMA mapping transaction is identified by a struct dma_iova_state, which holds some recomputed information for the transaction which does not change for each page being mapped, so add a check if IOVA can be used for the specific transaction. The API is exported from dma-iommu as it is the only implementation supported, the namespace is clearly different from iommu_* functions which are not allowed to be used. This code layout allows us to save function call per API call used in datapath as well as a lot of boilerplate code. Reviewed-by: Christoph Hellwig Tested-by: Jens Axboe Reviewed-by: Luis Chamberlain Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski --- include/linux/dma-mapping.h | 48 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index b79925b1c433..de7f73810d54 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -72,6 +72,22 @@ #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) +struct dma_iova_state { + dma_addr_t addr; + u64 __size; +}; + +/* + * Use the high bit to mark if we used swiotlb for one or more ranges. + */ +#define DMA_IOVA_USE_SWIOTLB (1ULL << 63) + +static inline size_t dma_iova_size(struct dma_iova_state *state) +{ + /* Casting is needed for 32-bits systems */ + return (size_t)(state->__size & ~DMA_IOVA_USE_SWIOTLB); +} + #ifdef CONFIG_DMA_API_DEBUG void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr); void debug_dma_map_single(struct device *dev, const void *addr, @@ -277,6 +293,38 @@ static inline int dma_mmap_noncontiguous(struct device *dev, } #endif /* CONFIG_HAS_DMA */ +#ifdef CONFIG_IOMMU_DMA +/** + * dma_use_iova - check if the IOVA API is used for this state + * @state: IOVA state + * + * Return %true if the DMA transfers uses the dma_iova_*() calls or %false if + * they can't be used. + */ +static inline bool dma_use_iova(struct dma_iova_state *state) +{ + return state->__size != 0; +} + +bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state, + phys_addr_t phys, size_t size); +void dma_iova_free(struct device *dev, struct dma_iova_state *state); +#else /* CONFIG_IOMMU_DMA */ +static inline bool dma_use_iova(struct dma_iova_state *state) +{ + return false; +} +static inline bool dma_iova_try_alloc(struct device *dev, + struct dma_iova_state *state, phys_addr_t phys, size_t size) +{ + return false; +} +static inline void dma_iova_free(struct device *dev, + struct dma_iova_state *state) +{ +} +#endif /* CONFIG_IOMMU_DMA */ + #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir); -- cgit v1.2.3 From 433a76207dcf5facc0183acb790f6e8398585258 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 5 May 2025 10:01:44 +0300 Subject: dma-mapping: Implement link/unlink ranges API Introduce new DMA APIs to perform DMA linkage of buffers in layers higher than DMA. In proposed API, the callers will perform the following steps. In map path: if (dma_can_use_iova(...)) dma_iova_alloc() for (page in range) dma_iova_link_next(...) dma_iova_sync(...) else /* Fallback to legacy map pages */ for (all pages) dma_map_page(...) In unmap path: if (dma_can_use_iova(...)) dma_iova_destroy() else for (all pages) dma_unmap_page(...) Reviewed-by: Christoph Hellwig Tested-by: Jens Axboe Reviewed-by: Luis Chamberlain Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski --- include/linux/dma-mapping.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index de7f73810d54..a71e110f1e9d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -309,6 +309,17 @@ static inline bool dma_use_iova(struct dma_iova_state *state) bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state, phys_addr_t phys, size_t size); void dma_iova_free(struct device *dev, struct dma_iova_state *state); +void dma_iova_destroy(struct device *dev, struct dma_iova_state *state, + size_t mapped_len, enum dma_data_direction dir, + unsigned long attrs); +int dma_iova_sync(struct device *dev, struct dma_iova_state *state, + size_t offset, size_t size); +int dma_iova_link(struct device *dev, struct dma_iova_state *state, + phys_addr_t phys, size_t offset, size_t size, + enum dma_data_direction dir, unsigned long attrs); +void dma_iova_unlink(struct device *dev, struct dma_iova_state *state, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs); #else /* CONFIG_IOMMU_DMA */ static inline bool dma_use_iova(struct dma_iova_state *state) { @@ -323,6 +334,27 @@ static inline void dma_iova_free(struct device *dev, struct dma_iova_state *state) { } +static inline void dma_iova_destroy(struct device *dev, + struct dma_iova_state *state, size_t mapped_len, + enum dma_data_direction dir, unsigned long attrs) +{ +} +static inline int dma_iova_sync(struct device *dev, + struct dma_iova_state *state, size_t offset, size_t size) +{ + return -EOPNOTSUPP; +} +static inline int dma_iova_link(struct device *dev, + struct dma_iova_state *state, phys_addr_t phys, size_t offset, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + return -EOPNOTSUPP; +} +static inline void dma_iova_unlink(struct device *dev, + struct dma_iova_state *state, size_t offset, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ +} #endif /* CONFIG_IOMMU_DMA */ #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) -- cgit v1.2.3 From 5f3b133a23c545272b6bc1e818a5a35e1ca84b1e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 May 2025 10:01:45 +0300 Subject: dma-mapping: add a dma_need_unmap helper Add helper that allows a driver to skip calling dma_unmap_* if the DMA layer can guarantee that they are no-nops. Signed-off-by: Christoph Hellwig Tested-by: Jens Axboe Reviewed-by: Luis Chamberlain Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski --- include/linux/dma-mapping.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index a71e110f1e9d..d2f358c5a25d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -406,6 +406,7 @@ static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { return dma_dev_need_sync(dev) ? __dma_need_sync(dev, dma_addr) : false; } +bool dma_need_unmap(struct device *dev); #else /* !CONFIG_HAS_DMA || !CONFIG_DMA_NEED_SYNC */ static inline bool dma_dev_need_sync(const struct device *dev) { @@ -431,6 +432,10 @@ static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { return false; } +static inline bool dma_need_unmap(struct device *dev) +{ + return false; +} #endif /* !CONFIG_HAS_DMA || !CONFIG_DMA_NEED_SYNC */ struct page *dma_alloc_pages(struct device *dev, size_t size, -- cgit v1.2.3 From 56dee46ff47f0ef9944dddd1fd137c94b7c2d9de Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 5 May 2025 22:17:40 +0800 Subject: block: move ELEVATOR_FLAG_DISABLE_WBT a request queue flag ELEVATOR_FLAG_DISABLE_WBT is only used by BFQ to disallow wbt when BFQ is in use. The flag is set in BFQ's init(), and cleared in BFQ's exit(). Making it as request queue flag, so that we can avoid to deal with elevator switch race. Also it isn't graceful to checking one scheduler flag in wbt_enable_default(). Reviewed-by: Christoph Hellwig Reviewed-by: Nilay Shroff Reviewed-by: Hannes Reinecke Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250505141805.2751237-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5ccb961ee2ae..c36d7a1c2cc0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -641,6 +641,7 @@ enum { QUEUE_FLAG_RQ_ALLOC_TIME, /* record rq->alloc_time_ns */ QUEUE_FLAG_HCTX_ACTIVE, /* at least one blk-mq hctx is active */ QUEUE_FLAG_SQ_SCHED, /* single queue style io dispatch */ + QUEUE_FLAG_DISABLE_WBT_DEF, /* for sched to disable/enable wbt */ QUEUE_FLAG_MAX }; @@ -676,6 +677,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) #define blk_queue_skip_tagset_quiesce(q) \ ((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE) +#define blk_queue_disable_wbt(q) \ + test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -- cgit v1.2.3 From 98e68f67020ce30e1a4d8e2d05d85a453738dfb8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 5 May 2025 22:17:46 +0800 Subject: block: prevent adding/deleting disk during updating nr_hw_queues Both adding/deleting disk code are reader of `nr_hw_queues`, so we can't allow them in-progress when updating nr_hw_queues, kernel panic and kasan has been reported in [1]. Prevent adding/deleting disk during updating nr_hw_queues by adding rw_semaphore to tagset, write lock is grabbed in blk_mq_update_nr_hw_queues(), and read lock is acquired when adding/deleting disk. Also mark GFP_NOIO allocation scope for adding/deleting disk because blk_mq_update_nr_hw_queues() is part of some driver's error handler. This way avoids lot of trouble. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Nilay Shroff Suggested-by: Nilay Shroff Reported-by: Nilay Shroff Closes: https://lore.kernel.org/linux-block/a5896cdb-a59a-4a37-9f99-20522f5d2987@linux.ibm.com/ Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250505141805.2751237-9-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 8eb9b3310167..ef84d53095a6 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -9,6 +9,7 @@ #include #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -527,6 +528,8 @@ struct blk_mq_tag_set { struct mutex tag_list_lock; struct list_head tag_list; struct srcu_struct *srcu; + + struct rw_semaphore update_nr_hwq_lock; }; /** -- cgit v1.2.3 From 21eed794ab4bd1a6c82a55df4416d18fb4d21da9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 5 May 2025 22:17:58 +0800 Subject: block: add new helper for disabling elevator switch when deleting disk Add new helper disable_elv_switch() and new flag QUEUE_FLAG_NO_ELV_SWITCH for disabling elevator switch before deleting disk: - originally flag QUEUE_FLAG_REGISTERED is added for preventing elevator switch during removing disk, but this flag has been used widely for other purposes, so add one new flag for disabling elevator switch only - for avoiding deadlock risk, we have to move elevator queue register/unregister out of elevator lock and queue freeze, which will be done in next patch. However, this way adds small race window between elevator switch and deleting ->queue_kobj, in which elevator queue register/unregister could be run concurrently. The added helper will be used for avoiding the race in the following patch. - drain in-progress elevator switch before deleting disk Suggested-by: Nilay Shroff Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Ming Lei Reviewed-by: Nilay Shroff Link: https://lore.kernel.org/r/20250505141805.2751237-21-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c36d7a1c2cc0..3aa1fd637d57 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -642,6 +642,7 @@ enum { QUEUE_FLAG_HCTX_ACTIVE, /* at least one blk-mq hctx is active */ QUEUE_FLAG_SQ_SCHED, /* single queue style io dispatch */ QUEUE_FLAG_DISABLE_WBT_DEF, /* for sched to disable/enable wbt */ + QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ QUEUE_FLAG_MAX }; @@ -679,6 +680,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); ((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE) #define blk_queue_disable_wbt(q) \ test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags) +#define blk_queue_no_elv_switch(q) \ + test_bit(QUEUE_FLAG_NO_ELV_SWITCH, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -- cgit v1.2.3 From 78c271344b6f64ce24c845e54903e09928cf2061 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 5 May 2025 22:18:03 +0800 Subject: block: move wbt_enable_default() out of queue freezing from sched ->exit() scheduler's ->exit() is called with queue frozen and elevator lock is held, and wbt_enable_default() can't be called with queue frozen, otherwise the following lockdep warning is triggered: #6 (&q->rq_qos_mutex){+.+.}-{4:4}: #5 (&eq->sysfs_lock){+.+.}-{4:4}: #4 (&q->elevator_lock){+.+.}-{4:4}: #3 (&q->q_usage_counter(io)#3){++++}-{0:0}: #2 (fs_reclaim){+.+.}-{0:0}: #1 (&sb->s_type->i_mutex_key#3){+.+.}-{4:4}: #0 (&q->debugfs_mutex){+.+.}-{4:4}: Fix the issue by moving wbt_enable_default() out of bfq's exit(), and call it from elevator_change_done(). Meantime add disk->rqos_state_mutex for covering wbt state change, which matches the purpose more than ->elevator_lock. Reviewed-by: Hannes Reinecke Reviewed-by: Nilay Shroff Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250505141805.2751237-26-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3aa1fd637d57..94323f303b37 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -218,6 +218,8 @@ struct gendisk { * devices that do not have multiple independent access ranges. */ struct blk_independent_access_ranges *ia_ranges; + + struct mutex rqos_state_mutex; /* rqos state change mutex */ }; /** -- cgit v1.2.3 From 732f25a2895a8c1c54fb56544f0b1e23770ef4d7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 May 2025 17:47:22 +0530 Subject: fs: add a write stream field to the kiocb Prepare for io_uring passthrough of write streams. The write stream field in the kiocb structure fits into an existing 2-byte hole, so its size is not changed. Reviewed-by: Hannes Reinecke Reviewed-by: Nitesh Shetty Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20250506121732.8211-2-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 016b0fe1536e..d5988867fe31 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -408,6 +408,7 @@ struct kiocb { void *private; int ki_flags; u16 ki_ioprio; /* See linux/ioprio.h */ + u8 ki_write_stream; union { /* * Only used for async buffered reads, where it denotes the -- cgit v1.2.3 From 5006f85ea23ea0bda9a8e31fdda126f4fca48f20 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 May 2025 17:47:23 +0530 Subject: block: add a bi_write_stream field Add the ability to pass a write stream for placement control in the bio. The new field fits in an existing hole, so does not change the size of the struct. Reviewed-by: Hannes Reinecke Reviewed-by: Nitesh Shetty Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20250506121732.8211-3-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 5a46067e85b1..f38425338c3f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -220,6 +220,7 @@ struct bio { unsigned short bi_flags; /* BIO_* below */ unsigned short bi_ioprio; enum rw_hint bi_write_hint; + u8 bi_write_stream; blk_status_t bi_status; atomic_t __bi_remaining; -- cgit v1.2.3 From d2f526ba27d29c442542f7c5df0a86ef0b576716 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 6 May 2025 17:47:24 +0530 Subject: block: introduce max_write_streams queue limit Drivers with hardware that support write streams need a way to export how many are available so applications can generically query this. Reviewed-by: Hannes Reinecke Reviewed-by: Nitesh Shetty Signed-off-by: Keith Busch [hch: renamed hints to streams, removed stacking] Signed-off-by: Christoph Hellwig Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20250506121732.8211-4-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 94323f303b37..13e1426ad38d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -404,6 +404,8 @@ struct queue_limits { unsigned short max_integrity_segments; unsigned short max_discard_segments; + unsigned short max_write_streams; + unsigned int max_open_zones; unsigned int max_active_zones; @@ -1270,6 +1272,13 @@ static inline unsigned int bdev_max_segments(struct block_device *bdev) return queue_max_segments(bdev_get_queue(bdev)); } +static inline unsigned short bdev_max_write_streams(struct block_device *bdev) +{ + if (bdev_is_partition(bdev)) + return 0; + return bdev_limits(bdev)->max_write_streams; +} + static inline unsigned queue_logical_block_size(const struct request_queue *q) { return q->limits.logical_block_size; -- cgit v1.2.3 From c23acfac10786ac5062a0615e23e68b913ac8da0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 May 2025 17:47:25 +0530 Subject: block: introduce a write_stream_granularity queue limit Export the granularity that write streams should be discarded with, as it is essential for making good use of them. Reviewed-by: Hannes Reinecke Reviewed-by: Nitesh Shetty Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20250506121732.8211-5-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 13e1426ad38d..52b7a27c46e8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -405,6 +405,7 @@ struct queue_limits { unsigned short max_discard_segments; unsigned short max_write_streams; + unsigned int write_stream_granularity; unsigned int max_open_zones; unsigned int max_active_zones; -- cgit v1.2.3 From ee203d3d86113559b77b1723e0d10909ebbd66ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 May 2025 17:47:30 +0530 Subject: nvme: add FDP definitions Add the config feature result, config log page, and management receive commands needed for FDP. Partially based on a patch from Kanchan Joshi . Reviewed-by: Hannes Reinecke Reviewed-by: Nitesh Shetty Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20250506121732.8211-10-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/nvme.h | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2479ed10f53e..51308f65b72f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -303,6 +303,7 @@ enum nvme_ctrl_attr { NVME_CTRL_ATTR_TBKAS = (1 << 6), NVME_CTRL_ATTR_ELBAS = (1 << 15), NVME_CTRL_ATTR_RHII = (1 << 18), + NVME_CTRL_ATTR_FDPS = (1 << 19), }; struct nvme_id_ctrl { @@ -689,6 +690,44 @@ struct nvme_rotational_media_log { __u8 rsvd24[488]; }; +struct nvme_fdp_config { + __u8 flags; +#define FDPCFG_FDPE (1U << 0) + __u8 fdpcidx; + __le16 reserved; +}; + +struct nvme_fdp_ruh_desc { + __u8 ruht; + __u8 reserved[3]; +}; + +struct nvme_fdp_config_desc { + __le16 dsze; + __u8 fdpa; + __u8 vss; + __le32 nrg; + __le16 nruh; + __le16 maxpids; + __le32 nns; + __le64 runs; + __le32 erutl; + __u8 rsvd28[36]; + struct nvme_fdp_ruh_desc ruhs[]; +}; + +struct nvme_fdp_config_log { + __le16 numfdpc; + __u8 ver; + __u8 rsvd3; + __le32 sze; + __u8 rsvd8[8]; + /* + * This is followed by variable number of nvme_fdp_config_desc + * structures, but sparse doesn't like nested variable sized arrays. + */ +}; + struct nvme_smart_log { __u8 critical_warning; __u8 temperature[2]; @@ -915,6 +954,7 @@ enum nvme_opcode { nvme_cmd_resv_register = 0x0d, nvme_cmd_resv_report = 0x0e, nvme_cmd_resv_acquire = 0x11, + nvme_cmd_io_mgmt_recv = 0x12, nvme_cmd_resv_release = 0x15, nvme_cmd_zone_mgmt_send = 0x79, nvme_cmd_zone_mgmt_recv = 0x7a, @@ -936,6 +976,7 @@ enum nvme_opcode { nvme_opcode_name(nvme_cmd_resv_register), \ nvme_opcode_name(nvme_cmd_resv_report), \ nvme_opcode_name(nvme_cmd_resv_acquire), \ + nvme_opcode_name(nvme_cmd_io_mgmt_recv), \ nvme_opcode_name(nvme_cmd_resv_release), \ nvme_opcode_name(nvme_cmd_zone_mgmt_send), \ nvme_opcode_name(nvme_cmd_zone_mgmt_recv), \ @@ -1087,6 +1128,7 @@ enum { NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, NVME_RW_PRINFO_PRACT = 1 << 13, NVME_RW_DTYPE_STREAMS = 1 << 4, + NVME_RW_DTYPE_DPLCMT = 2 << 4, NVME_WZ_DEAC = 1 << 9, }; @@ -1174,6 +1216,38 @@ struct nvme_zone_mgmt_recv_cmd { __le32 cdw14[2]; }; +struct nvme_io_mgmt_recv_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le64 rsvd2[2]; + union nvme_data_ptr dptr; + __u8 mo; + __u8 rsvd11; + __u16 mos; + __le32 numd; + __le32 cdw12[4]; +}; + +enum { + NVME_IO_MGMT_RECV_MO_RUHS = 1, +}; + +struct nvme_fdp_ruh_status_desc { + __le16 pid; + __le16 ruhid; + __le32 earutr; + __le64 ruamw; + __u8 reserved[16]; +}; + +struct nvme_fdp_ruh_status { + __u8 rsvd0[14]; + __le16 nruhsd; + struct nvme_fdp_ruh_status_desc ruhsd[]; +}; + enum { NVME_ZRA_ZONE_REPORT = 0, NVME_ZRASF_ZONE_REPORT_ALL = 0, @@ -1309,6 +1383,7 @@ enum { NVME_FEAT_PLM_WINDOW = 0x14, NVME_FEAT_HOST_BEHAVIOR = 0x16, NVME_FEAT_SANITIZE = 0x17, + NVME_FEAT_FDP = 0x1d, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, NVME_FEAT_RESV_MASK = 0x82, @@ -1329,6 +1404,7 @@ enum { NVME_LOG_ANA = 0x0c, NVME_LOG_FEATURES = 0x12, NVME_LOG_RMI = 0x16, + NVME_LOG_FDP_CONFIGS = 0x20, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), @@ -1923,6 +1999,7 @@ struct nvme_command { struct nvmf_auth_receive_command auth_receive; struct nvme_dbbuf dbbuf; struct nvme_directive_cmd directive; + struct nvme_io_mgmt_recv_cmd imr; }; }; -- cgit v1.2.3 From bb5eb8a5b222fa5092f60d5555867a05ebc3bdf2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 22 Apr 2025 19:56:38 +0800 Subject: f2fs: fix to bail out in get_new_segment() ------------[ cut here ]------------ WARNING: CPU: 3 PID: 579 at fs/f2fs/segment.c:2832 new_curseg+0x5e8/0x6dc pc : new_curseg+0x5e8/0x6dc Call trace: new_curseg+0x5e8/0x6dc f2fs_allocate_data_block+0xa54/0xe28 do_write_page+0x6c/0x194 f2fs_do_write_node_page+0x38/0x78 __write_node_page+0x248/0x6d4 f2fs_sync_node_pages+0x524/0x72c f2fs_write_checkpoint+0x4bc/0x9b0 __checkpoint_and_complete_reqs+0x80/0x244 issue_checkpoint_thread+0x8c/0xec kthread+0x114/0x1bc ret_from_fork+0x10/0x20 get_new_segment() detects inconsistent status in between free_segmap and free_secmap, let's record such error into super block, and bail out get_new_segment() instead of continue using the segment. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index c24f8bc01045..5206d63b3386 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -78,6 +78,7 @@ enum stop_cp_reason { STOP_CP_REASON_UPDATE_INODE, STOP_CP_REASON_FLUSH_FAIL, STOP_CP_REASON_NO_SEGMENT, + STOP_CP_REASON_CORRUPTED_FREE_BITMAP, STOP_CP_REASON_MAX, }; -- cgit v1.2.3 From 2dbf6e0df447d1542f8fd158b17a06d2e8ede15e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 4 May 2025 21:04:08 -0400 Subject: kill vfs_submount() The last remaining user of vfs_submount() (tracefs) is easy to convert to fs_context_for_submount(); do that and bury that thing, along with SB_SUBMOUNT Reviewed-by: Jan Kara Acked-by: Steven Rostedt (Google) Tested-by: Steven Rostedt (Google) Signed-off-by: Al Viro --- include/linux/fs.h | 1 - include/linux/mount.h | 3 --- 2 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 016b0fe1536e..515e702d98ae 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1240,7 +1240,6 @@ extern int send_sigurg(struct file *file); /* These sb flags are internal to the kernel */ #define SB_DEAD BIT(21) #define SB_DYING BIT(24) -#define SB_SUBMOUNT BIT(26) #define SB_FORCE BIT(27) #define SB_NOSEC BIT(28) #define SB_BORN BIT(29) diff --git a/include/linux/mount.h b/include/linux/mount.h index dcc17ce8a959..d4eb90a367af 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -98,9 +98,6 @@ extern struct vfsmount *vfs_create_mount(struct fs_context *fc); extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data); -extern struct vfsmount *vfs_submount(const struct dentry *mountpoint, - struct file_system_type *type, - const char *name, void *data); extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); -- cgit v1.2.3 From 20b4f0b4cdfec106bb73afd47de7d121b723f723 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 30 Apr 2025 00:45:57 +0100 Subject: rpmsg: core: Remove deadcode rpmsg_send_offchannel() and rpmsg_trysend_offchannel() have been unused since they were added in 2011's commit bcabbccabffe ("rpmsg: add virtio-based remote processor messaging bus") Remove them and associated docs. Signed-off-by: Dr. David Alan Gilbert Acked-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20250429234600.301083-2-linux@treblig.org Signed-off-by: Mathieu Poirier --- include/linux/rpmsg.h | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h index 90d8e4475f80..fb7ab9165645 100644 --- a/include/linux/rpmsg.h +++ b/include/linux/rpmsg.h @@ -184,13 +184,9 @@ struct rpmsg_endpoint *rpmsg_create_ept(struct rpmsg_device *, int rpmsg_send(struct rpmsg_endpoint *ept, void *data, int len); int rpmsg_sendto(struct rpmsg_endpoint *ept, void *data, int len, u32 dst); -int rpmsg_send_offchannel(struct rpmsg_endpoint *ept, u32 src, u32 dst, - void *data, int len); int rpmsg_trysend(struct rpmsg_endpoint *ept, void *data, int len); int rpmsg_trysendto(struct rpmsg_endpoint *ept, void *data, int len, u32 dst); -int rpmsg_trysend_offchannel(struct rpmsg_endpoint *ept, u32 src, u32 dst, - void *data, int len); __poll_t rpmsg_poll(struct rpmsg_endpoint *ept, struct file *filp, poll_table *wait); @@ -271,15 +267,6 @@ static inline int rpmsg_sendto(struct rpmsg_endpoint *ept, void *data, int len, } -static inline int rpmsg_send_offchannel(struct rpmsg_endpoint *ept, u32 src, - u32 dst, void *data, int len) -{ - /* This shouldn't be possible */ - WARN_ON(1); - - return -ENXIO; -} - static inline int rpmsg_trysend(struct rpmsg_endpoint *ept, void *data, int len) { /* This shouldn't be possible */ @@ -297,15 +284,6 @@ static inline int rpmsg_trysendto(struct rpmsg_endpoint *ept, void *data, return -ENXIO; } -static inline int rpmsg_trysend_offchannel(struct rpmsg_endpoint *ept, u32 src, - u32 dst, void *data, int len) -{ - /* This shouldn't be possible */ - WARN_ON(1); - - return -ENXIO; -} - static inline __poll_t rpmsg_poll(struct rpmsg_endpoint *ept, struct file *filp, poll_table *wait) { -- cgit v1.2.3 From a1ceb831417b8e23bd041d3e7021e235fa845128 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 Apr 2025 08:55:41 +0200 Subject: genirq/manage: Rework can_request_irq() Use the new guards to get and lock the interrupt descriptor and tidy up the code. Make the return value boolean to reflect it's meaning. No functional change. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20250429065422.187250840@linutronix.de --- include/linux/irq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index dd5df1e2d032..df93b238b8bf 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -700,7 +700,7 @@ extern void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret); extern int noirqdebug_setup(char *str); /* Checks whether the interrupt can be requested by request_irq(): */ -extern int can_request_irq(unsigned int irq, unsigned long irqflags); +extern bool can_request_irq(unsigned int irq, unsigned long irqflags); /* Dummy irq-chip implementations: */ extern struct irq_chip no_irq_chip; -- cgit v1.2.3 From 66db876162155c1cec87359cd78c62aaafde9257 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 7 May 2025 11:21:22 +0200 Subject: bus: firewall: Fix missing static inline annotations for stubs Stubs in the header file for !CONFIG_STM32_FIREWALL case should be both static and inline, because they do not come with earlier declaration and should be inlined in every unit including the header. Cc: Patrice Chotard Cc: stable@vger.kernel.org Fixes: 5c9668cfc6d7 ("firewall: introduce stm32_firewall framework") Link: https://lore.kernel.org/r/20250507092121.95121-2-krzysztof.kozlowski@linaro.org Signed-off-by: Krzysztof Kozlowski --- include/linux/bus/stm32_firewall_device.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bus/stm32_firewall_device.h b/include/linux/bus/stm32_firewall_device.h index 5178b72bc920..eaa7a3f54450 100644 --- a/include/linux/bus/stm32_firewall_device.h +++ b/include/linux/bus/stm32_firewall_device.h @@ -114,27 +114,30 @@ void stm32_firewall_release_access_by_id(struct stm32_firewall *firewall, u32 su #else /* CONFIG_STM32_FIREWALL */ -int stm32_firewall_get_firewall(struct device_node *np, struct stm32_firewall *firewall, - unsigned int nb_firewall) +static inline int stm32_firewall_get_firewall(struct device_node *np, + struct stm32_firewall *firewall, + unsigned int nb_firewall) { return -ENODEV; } -int stm32_firewall_grant_access(struct stm32_firewall *firewall) +static inline int stm32_firewall_grant_access(struct stm32_firewall *firewall) { return -ENODEV; } -void stm32_firewall_release_access(struct stm32_firewall *firewall) +static inline void stm32_firewall_release_access(struct stm32_firewall *firewall) { } -int stm32_firewall_grant_access_by_id(struct stm32_firewall *firewall, u32 subsystem_id) +static inline int stm32_firewall_grant_access_by_id(struct stm32_firewall *firewall, + u32 subsystem_id) { return -ENODEV; } -void stm32_firewall_release_access_by_id(struct stm32_firewall *firewall, u32 subsystem_id) +static inline void stm32_firewall_release_access_by_id(struct stm32_firewall *firewall, + u32 subsystem_id) { } -- cgit v1.2.3 From aefc11550ebd08eadee6d643792c9092de2e472f Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 20 Apr 2025 17:46:56 +0100 Subject: genirq: Remove unused remove_percpu_irq() remove_percpu_irq() has been unused since it was added in 2011 by commit 31d9d9b6d830 ("genirq: Add support for per-cpu dev_id interrupts") Remove it. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250420164656.112641-1-linux@treblig.org --- include/linux/irq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index df93b238b8bf..810e44ee297f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -597,7 +597,6 @@ enum { struct irqaction; extern int setup_percpu_irq(unsigned int irq, struct irqaction *new); -extern void remove_percpu_irq(unsigned int irq, struct irqaction *act); #ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE extern void irq_cpu_online(void); -- cgit v1.2.3 From d683a8561889c1813fe2ad6082769c91e3cb71b3 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Wed, 30 Apr 2025 16:27:09 +0000 Subject: ubsan: Remove regs from report_ubsan_failure() report_ubsan_failure() doesn't use argument regs, and soon it will be called from the hypervisor context were regs are not available. So, remove the unused argument. Signed-off-by: Mostafa Saleh Acked-by: Kees Cook Link: https://lore.kernel.org/r/20250430162713.1997569-3-smostafa@google.com Signed-off-by: Marc Zyngier --- include/linux/ubsan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ubsan.h b/include/linux/ubsan.h index d8219cbe09ff..c843816f5f68 100644 --- a/include/linux/ubsan.h +++ b/include/linux/ubsan.h @@ -3,9 +3,9 @@ #define _LINUX_UBSAN_H #ifdef CONFIG_UBSAN_TRAP -const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type); +const char *report_ubsan_failure(u32 check_type); #else -static inline const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type) +static inline const char *report_ubsan_failure(u32 check_type) { return NULL; } -- cgit v1.2.3 From 61b38f7591fb434fce326c1d686a9793c7f418bc Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Wed, 30 Apr 2025 16:27:10 +0000 Subject: KVM: arm64: Introduce CONFIG_UBSAN_KVM_EL2 Add a new Kconfig CONFIG_UBSAN_KVM_EL2 for KVM which enables UBSAN for EL2 code (in protected/nvhe/hvhe) modes. This will re-use the same checks enabled for the kernel for the hypervisor. The only difference is that for EL2 it always emits a "brk" instead of implementing hooks as the hypervisor can't print reports. The KVM code will re-use the same code for the kernel "report_ubsan_failure()" so #ifdefs are changed to also have this code for CONFIG_UBSAN_KVM_EL2 Signed-off-by: Mostafa Saleh Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250430162713.1997569-4-smostafa@google.com Signed-off-by: Marc Zyngier --- include/linux/ubsan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ubsan.h b/include/linux/ubsan.h index c843816f5f68..3ab8d38aedb8 100644 --- a/include/linux/ubsan.h +++ b/include/linux/ubsan.h @@ -2,7 +2,7 @@ #ifndef _LINUX_UBSAN_H #define _LINUX_UBSAN_H -#ifdef CONFIG_UBSAN_TRAP +#if defined(CONFIG_UBSAN_TRAP) || defined(CONFIG_UBSAN_KVM_EL2) const char *report_ubsan_failure(u32 check_type); #else static inline const char *report_ubsan_failure(u32 check_type) -- cgit v1.2.3 From f31acff017b1d80e649f674450d3b64d3265848c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 7 May 2025 17:25:37 +0800 Subject: block: fix warning on 'make htmldocs' Fix the following warning when running 'make htmldocs': +WARNING: include/linux/blk-mq.h:532 struct member 'update_nr_hwq_lock' not described in 'blk_mq_tag_set' Fixes: 98e68f67020c ("block: prevent adding/deleting disk during updating nr_hw_queues") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/all/20250507163220.00141d77@canb.auug.org.au/ Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250507092537.3009112-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ef84d53095a6..906bf330ffb5 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -507,6 +507,9 @@ enum hctx_type { * request_queue.tag_set_list. * @srcu: Use as lock when type of the request queue is blocking * (BLK_MQ_F_BLOCKING). + * @update_nr_hwq_lock: + * Synchronize updating nr_hw_queues with add/del disk & + * switching elevator. */ struct blk_mq_tag_set { const struct blk_mq_ops *ops; -- cgit v1.2.3 From 850e210d5ad21b94b55b97d4d82b4cdeb0bb05df Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 May 2025 14:04:25 +0200 Subject: block: add a bio_add_virt_nofail helper Add a helper to add a directly mapped kernel virtual address to a bio so that callers don't have to convert to pages or folios. For now only the _nofail variant is provided as that is what all the obvious callers want. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250507120451.4000627-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index cafc7c215de8..acca7464080c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -417,6 +417,8 @@ void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off); +void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len); + int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); -- cgit v1.2.3 From 10b1e59cdadabff16fc78eb2ca4c341b1502293c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 May 2025 14:04:26 +0200 Subject: block: add a bdev_rw_virt helper Add a helper to perform synchronous I/O on a kernel direct map range. Currently this is implemented in various places in usually not very efficient ways, so provide a generic helper instead. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250507120451.4000627-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index acca7464080c..ad54e6af20dc 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -402,7 +402,6 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) struct request_queue; -extern int submit_bio_wait(struct bio *bio); void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, unsigned short max_vecs, blk_opf_t opf); extern void bio_uninit(struct bio *); @@ -419,6 +418,10 @@ void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off); void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len); +int submit_bio_wait(struct bio *bio); +int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, + size_t len, enum req_op op); + int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); -- cgit v1.2.3 From 75f88659e47dc570bdebddf77d7a3cd5f0845612 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 May 2025 14:04:27 +0200 Subject: block: add a bio_add_max_vecs helper Add a helper to check how many bio_vecs are needed to add a kernel virtual address range to a bio, accounting for the always contiguous direct mapping and vmalloc mappings that usually need a bio_vec per page sized chunk. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250507120451.4000627-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index ad54e6af20dc..128b1c6ca648 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -418,6 +418,21 @@ void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off); void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len); +/** + * bio_add_max_vecs - number of bio_vecs needed to add data to a bio + * @kaddr: kernel virtual address to add + * @len: length in bytes to add + * + * Calculate how many bio_vecs need to be allocated to add the kernel virtual + * address range in [@kaddr:@len] in the worse case. + */ +static inline unsigned int bio_add_max_vecs(void *kaddr, unsigned int len) +{ + if (is_vmalloc_addr(kaddr)) + return DIV_ROUND_UP(offset_in_page(kaddr) + len, PAGE_SIZE); + return 1; +} + int submit_bio_wait(struct bio *bio); int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op); -- cgit v1.2.3 From 8dd16f5e34693aa0aa6a4ed48427045008097e64 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 May 2025 14:04:28 +0200 Subject: block: add a bio_add_vmalloc helpers Add a helper to add a vmalloc region to a bio, abstracting away the vmalloc addresses from the underlying pages and another one wrapping it for the simple case where all data fits into a single bio. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250507120451.4000627-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 128b1c6ca648..5d880903bcc5 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -433,6 +433,9 @@ static inline unsigned int bio_add_max_vecs(void *kaddr, unsigned int len) return 1; } +unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len); +bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len); + int submit_bio_wait(struct bio *bio); int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op); -- cgit v1.2.3 From af78428ed3f3eebad7be9d0463251046e9582cf6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 May 2025 14:04:29 +0200 Subject: block: remove the q argument from blk_rq_map_kern Remove the q argument from blk_rq_map_kern and the internal helpers called by it as the queue can trivially be derived from the request. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250507120451.4000627-6-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 906bf330ffb5..de8c85a03bb7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1037,8 +1037,8 @@ int blk_rq_map_user_io(struct request *, struct rq_map_data *, int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); int blk_rq_unmap_user(struct bio *); -int blk_rq_map_kern(struct request_queue *, struct request *, void *, - unsigned int, gfp_t); +int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len, + gfp_t gfp); int blk_rq_append_bio(struct request *rq, struct bio *bio); void blk_execute_rq_nowait(struct request *rq, bool at_head); blk_status_t blk_execute_rq(struct request *rq, bool at_head); -- cgit v1.2.3 From b8c7bfb7a0f01521297d688ca5fe1482c81e8910 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 14 Apr 2025 14:30:56 -0400 Subject: irqdomain: Add IRQ_DOMAIN_FLAG_MSI_IMMUTABLE and irq_domain_is_msi_immutable() Add the flag IRQ_DOMAIN_FLAG_MSI_IMMUTABLE and the API function irq_domain_is_msi_immutable() to check if the MSI controller retains an immutable address/data pair during irq_set_affinity(). Ensure compatibility with MSI users like PCIe Endpoint Doorbell, which require the address/data pair to remain unchanged after setup. Use this function to verify if the MSI controller is immutable. Signed-off-by: Frank Li Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250414-ep-msi-v18-2-f69b49917464@nxp.com --- include/linux/irqdomain.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index bb7111105296..bccfff5bebe2 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -231,6 +231,9 @@ enum { /* Irq domain must destroy generic chips when removed */ IRQ_DOMAIN_FLAG_DESTROY_GC = (1 << 10), + /* Address and data pair is mutable when irq_set_affinity() */ + IRQ_DOMAIN_FLAG_MSI_IMMUTABLE = (1 << 11), + /* * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved * for implementation specific purposes and ignored by the @@ -691,6 +694,10 @@ static inline bool irq_domain_is_msi_device(struct irq_domain *domain) return domain->flags & IRQ_DOMAIN_FLAG_MSI_DEVICE; } +static inline bool irq_domain_is_msi_immutable(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_MSI_IMMUTABLE; +} #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ static inline int irq_domain_alloc_irqs(struct irq_domain *domain, unsigned int nr_irqs, int node, void *arg) -- cgit v1.2.3 From f42c8556a0690802246dc588ae9c18184f71d8f5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 6 May 2025 22:34:31 +0200 Subject: cpufreq/sched: schedutil: Add helper for governor checks Add a helper for checking if schedutil is the current governor for a given cpufreq policy and use it in sched_is_eas_possible() to avoid accessing cpufreq policy internals directly from there. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Christian Loehle Reviewed-by: Dietmar Eggemann Link: https://patch.msgid.link/3365956.44csPzL39Z@rjwysocki.net --- include/linux/cpufreq.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 7bb760031dd7..1d2c6c6d8952 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -650,6 +650,15 @@ module_exit(__governor##_exit) struct cpufreq_governor *cpufreq_default_governor(void); struct cpufreq_governor *cpufreq_fallback_governor(void); +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL +bool sugov_is_governor(struct cpufreq_policy *policy); +#else +static inline bool sugov_is_governor(struct cpufreq_policy *policy) +{ + return false; +} +#endif + static inline void cpufreq_policy_apply_limits(struct cpufreq_policy *policy) { if (policy->max < policy->cur) -- cgit v1.2.3 From 4854649b1fb43968615e0374d9d59580093ac67f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 6 May 2025 22:37:15 +0200 Subject: cpufreq/sched: Move cpufreq-specific EAS checks to cpufreq Doing cpufreq-specific EAS checks that require accessing policy internals directly from sched_is_eas_possible() is a bit unfortunate, so introduce cpufreq_ready_for_eas() in cpufreq, move those checks into that new function and make sched_is_eas_possible() call it. While at it, address a possible race between the EAS governor check and governor change by doing the former under the policy rwsem. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Christian Loehle Reviewed-by: Dietmar Eggemann Link: https://patch.msgid.link/2317800.iZASKD2KPV@rjwysocki.net --- include/linux/cpufreq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1d2c6c6d8952..95f3807c8c55 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -1237,6 +1237,8 @@ void cpufreq_generic_init(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int transition_latency); +bool cpufreq_ready_for_eas(const struct cpumask *cpu_mask); + static inline void cpufreq_register_em_with_opp(struct cpufreq_policy *policy) { dev_pm_opp_of_register_em(get_cpu_device(policy->cpu), -- cgit v1.2.3 From 6bceea7a1e076ef9d71b20d8dda2f7dc52bd34d2 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 18 Apr 2025 19:55:03 -0700 Subject: arch_topology: Relocate cpu_scale to topology.[h|c] arch_topology.c provides functionality to parse and scale CPU capacity. It also provides a corresponding sysfs interface. Some architectures parse and scale CPU capacity differently as per their own needs. On Intel processors, for instance, it is responsibility of the Intel P-state driver. Relocate the implementation of that interface to a common location in topology.c. Architectures can use the interface and populate it using their own mechanisms. An alternative approach would be to compile arch_topology.c even if not needed only to get this interface. This approach would create duplicated and conflicting functionality and data structures. Signed-off-by: Ricardo Neri Tested-by: Christian Loehle Link: https://patch.msgid.link/20250419025504.9760-2-ricardo.neri-calderon@linux.intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/arch_topology.h | 8 -------- include/linux/topology.h | 9 +++++++++ 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 2222e8b03ff4..d72d6e5aa200 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -14,14 +14,6 @@ int topology_update_cpu_topology(void); struct device_node; bool topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu); -DECLARE_PER_CPU(unsigned long, cpu_scale); - -static inline unsigned long topology_get_cpu_scale(int cpu) -{ - return per_cpu(cpu_scale, cpu); -} - -void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); DECLARE_PER_CPU(unsigned long, capacity_freq_ref); diff --git a/include/linux/topology.h b/include/linux/topology.h index 24e715f0f6d2..cd6b4bdc9cfd 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -332,4 +332,13 @@ sched_numa_hop_mask(unsigned int node, unsigned int hops) !IS_ERR_OR_NULL(mask); \ __hops++) +DECLARE_PER_CPU(unsigned long, cpu_scale); + +static inline unsigned long topology_get_cpu_scale(int cpu) +{ + return per_cpu(cpu_scale, cpu); +} + +void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); + #endif /* _LINUX_TOPOLOGY_H */ -- cgit v1.2.3 From 5d894321c49e61379189b0ff605f316e39cbd1e9 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 7 May 2025 14:18:21 -0700 Subject: fs: add atomic write unit max opt to statx XFS will be able to support large atomic writes (atomic write > 1x block) in future. This will be achieved by using different operating methods, depending on the size of the write. Specifically a new method of operation based in FS atomic extent remapping will be supported in addition to the current HW offload-based method. The FS method will generally be appreciably slower performing than the HW-offload method. However the FS method will be typically able to contribute to achieving a larger atomic write unit max limit. XFS will support a hybrid mode, where HW offload method will be used when possible, i.e. HW offload is used when the length of the write is supported, and for other times FS-based atomic writes will be used. As such, there is an atomic write length at which the user may experience appreciably slower performance. Advertise this limit in a new statx field, stx_atomic_write_unit_max_opt. When zero, it means that there is no such performance boundary. Masks STATX{_ATTR}_WRITE_ATOMIC can be used to get this new field. This is ok for older kernels which don't support this new field, as they would report 0 in this field (from zeroing in cp_statx()) already. Furthermore those older kernels don't support large atomic writes - apart from block fops, but there would be consistent performance there for atomic writes in range [unit min, unit max]. Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Acked-by: Darrick J. Wong Signed-off-by: John Garry --- include/linux/fs.h | 3 ++- include/linux/stat.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 016b0fe1536e..7b19d8f99aff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3475,7 +3475,8 @@ void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); void generic_fill_statx_atomic_writes(struct kstat *stat, unsigned int unit_min, - unsigned int unit_max); + unsigned int unit_max, + unsigned int unit_max_opt); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); void __inode_add_bytes(struct inode *inode, loff_t bytes); diff --git a/include/linux/stat.h b/include/linux/stat.h index be7496a6a0dd..e3d00e7bb26d 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -57,6 +57,7 @@ struct kstat { u32 dio_read_offset_align; u32 atomic_write_unit_min; u32 atomic_write_unit_max; + u32 atomic_write_unit_max_opt; u32 atomic_write_segments_max; }; -- cgit v1.2.3 From 5017b1b02640234b08ad38a046043f143670dea2 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Tue, 18 Mar 2025 20:41:42 -0500 Subject: ipmi: Add a note about the pretimeout callback You can't do IPMI calls from the callback, it's called with locks held. Signed-off-by: Corey Minyard --- include/linux/ipmi.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 2f74dd90c271..27cd5980bb27 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -93,7 +93,8 @@ struct ipmi_user_hndl { /* * Called when the interface detects a watchdog pre-timeout. If - * this is NULL, it will be ignored for the user. + * this is NULL, it will be ignored for the user. Note that you + * can't do any IPMI calls from here, it's called with locks held. */ void (*ipmi_watchdog_pretimeout)(void *handler_data); -- cgit v1.2.3 From 6f7f6605c9aeb472b5d4101b263b1fc5dc3cf623 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Tue, 22 Apr 2025 12:06:02 -0500 Subject: ipmi:msghandler: Export and fix panic messaging capability Don't have the other users that do things at panic time (the watchdog) do all this themselves, provide a function to do it. Also, with the new design where most stuff happens at thread context, a few things needed to be fixed to avoid doing locking in a panic context. Signed-off-by: Corey Minyard --- include/linux/ipmi.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 27cd5980bb27..7da6602eab71 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -344,4 +344,14 @@ extern int ipmi_get_smi_info(int if_num, struct ipmi_smi_info *data); /* Helper function for computing the IPMB checksum of some data. */ unsigned char ipmb_checksum(unsigned char *data, int size); +/* + * For things that must send messages at panic time, like the IPMI watchdog + * driver that extends the reset time on a panic, use this to send messages + * from panic context. Note that this puts the driver into a mode that + * only works at panic time, so only use it then. + */ +void ipmi_panic_request_and_wait(struct ipmi_user *user, + struct ipmi_addr *addr, + struct kernel_ipmi_msg *msg); + #endif /* __LINUX_IPMI_H */ -- cgit v1.2.3 From 59529bbe642de4eb2191a541d9b4bae7eb73862e Mon Sep 17 00:00:00 2001 From: Huang Yiwei Date: Wed, 7 May 2025 12:57:57 +0800 Subject: firmware: SDEI: Allow sdei initialization without ACPI_APEI_GHES SDEI usually initialize with the ACPI table, but on platforms where ACPI is not used, the SDEI feature can still be used to handle specific firmware calls or other customized purposes. Therefore, it is not necessary for ARM_SDE_INTERFACE to depend on ACPI_APEI_GHES. In commit dc4e8c07e9e2 ("ACPI: APEI: explicit init of HEST and GHES in acpi_init()"), to make APEI ready earlier, sdei_init was moved into acpi_ghes_init instead of being a standalone initcall, adding ACPI_APEI_GHES dependency to ARM_SDE_INTERFACE. This restricts the flexibility and usability of SDEI. This patch corrects the dependency in Kconfig and splits sdei_init() into two separate functions: sdei_init() and acpi_sdei_init(). sdei_init() will be called by arch_initcall and will only initialize the platform driver, while acpi_sdei_init() will initialize the device from acpi_ghes_init() when ACPI is ready. This allows the initialization of SDEI without ACPI_APEI_GHES enabled. Fixes: dc4e8c07e9e2 ("ACPI: APEI: explicit init of HEST and GHES in apci_init()") Cc: Shuai Xue Signed-off-by: Huang Yiwei Reviewed-by: Shuai Xue Reviewed-by: Gavin Shan Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20250507045757.2658795-1-quic_hyiwei@quicinc.com Signed-off-by: Will Deacon --- include/linux/arm_sdei.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm_sdei.h b/include/linux/arm_sdei.h index 255701e1251b..f652a5028b59 100644 --- a/include/linux/arm_sdei.h +++ b/include/linux/arm_sdei.h @@ -46,12 +46,12 @@ int sdei_unregister_ghes(struct ghes *ghes); /* For use by arch code when CPU hotplug notifiers are not appropriate. */ int sdei_mask_local_cpu(void); int sdei_unmask_local_cpu(void); -void __init sdei_init(void); +void __init acpi_sdei_init(void); void sdei_handler_abort(void); #else static inline int sdei_mask_local_cpu(void) { return 0; } static inline int sdei_unmask_local_cpu(void) { return 0; } -static inline void sdei_init(void) { } +static inline void acpi_sdei_init(void) { } static inline void sdei_handler_abort(void) { } #endif /* CONFIG_ARM_SDE_INTERFACE */ -- cgit v1.2.3 From 812bca7f7e73778a7022f862dc8c7c7d38b9a760 Mon Sep 17 00:00:00 2001 From: Xi Pardee Date: Fri, 25 Apr 2025 12:52:29 -0700 Subject: platform/x86:intel/vsec: Change return type of intel_vsec_register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change return type of intel_vsec_register() to int. The current implementation does not indicate if the register fail or not. Change to return error code if it fails or if INTEL_VSEC config is not set. This is a preparation step to introduce a new SSRAM Telemetry driver that will be using this API. Signed-off-by: Xi Pardee Link: https://lore.kernel.org/r/20250425195237.493129-2-xi.pardee@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/intel_vsec.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index b94beab64610..bc95821f1bfb 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -139,12 +139,13 @@ static inline struct intel_vsec_device *auxdev_to_ivdev(struct auxiliary_device } #if IS_ENABLED(CONFIG_INTEL_VSEC) -void intel_vsec_register(struct pci_dev *pdev, +int intel_vsec_register(struct pci_dev *pdev, struct intel_vsec_platform_info *info); #else -static inline void intel_vsec_register(struct pci_dev *pdev, +static inline int intel_vsec_register(struct pci_dev *pdev, struct intel_vsec_platform_info *info) { + return -ENODEV; } #endif #endif -- cgit v1.2.3 From feea7bd6b02d43a794e3f065650d89cf8d8e8e59 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 23 Mar 2025 15:34:21 +1300 Subject: platform/x86: asus-wmi: Refactor Ally suspend/resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adjust how the CSEE direct call hack is used. The results of months of testing combined with help from ASUS to determine the actual cause of suspend issues has resulted in this refactoring which immensely improves the reliability for devices which do not have the following minimum MCU FW version: - ROG Ally X: 313 - ROG Ally 1: 319 For MCU FW versions that match the minimum or above the CSEE hack is disabled and mcu_powersave set to on by default as there are no negatives beyond a slightly slower device reinitialization due to the MCU being powered off. As this is set only at module load time, it is still possible for mcu_powersave sysfs attributes to change it at runtime if so desired. Signed-off-by: Luke D. Jones Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20250323023421.78012-3-luke@ljones.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/asus-wmi.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 783e2a336861..8a515179113d 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -157,9 +157,28 @@ #define ASUS_WMI_DSTS_MAX_BRIGTH_MASK 0x0000FF00 #define ASUS_WMI_DSTS_LIGHTBAR_MASK 0x0000000F +enum asus_ally_mcu_hack { + ASUS_WMI_ALLY_MCU_HACK_INIT, + ASUS_WMI_ALLY_MCU_HACK_ENABLED, + ASUS_WMI_ALLY_MCU_HACK_DISABLED, +}; + #if IS_REACHABLE(CONFIG_ASUS_WMI) +void set_ally_mcu_hack(enum asus_ally_mcu_hack status); +void set_ally_mcu_powersave(bool enabled); +int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval); int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval); #else +static inline void set_ally_mcu_hack(enum asus_ally_mcu_hack status) +{ +} +static inline void set_ally_mcu_powersave(bool enabled) +{ +} +static inline int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval) +{ + return -ENODEV; +} static inline int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval) { -- cgit v1.2.3 From 9950f94e485908fc42e7bb2533aff13b7e97a36c Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 5 May 2025 16:25:58 +0100 Subject: platform/x86/sony-laptop: Remove unused sony laptop camera code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit ba47652ba655 ("media: meye: remove this deprecated driver") removed the meye driver but left behind the code in sony-laptop.c which that driver used to call. Remove the sony_pic_camera_command() function, and the set of defines (SONY_PIC_COMMAND_*) in a header used for the interface and the static helpers it called. Cleanup remaining #defines. Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20250505152558.40526-1-linux@treblig.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/sony-laptop.h | 39 --------------------------------------- 1 file changed, 39 deletions(-) delete mode 100644 include/linux/sony-laptop.h (limited to 'include/linux') diff --git a/include/linux/sony-laptop.h b/include/linux/sony-laptop.h deleted file mode 100644 index 1e3c92feea6e..000000000000 --- a/include/linux/sony-laptop.h +++ /dev/null @@ -1,39 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _SONYLAPTOP_H_ -#define _SONYLAPTOP_H_ - -#include - -#ifdef __KERNEL__ - -/* used only for communication between v4l and sony-laptop */ - -#define SONY_PIC_COMMAND_GETCAMERA 1 /* obsolete */ -#define SONY_PIC_COMMAND_SETCAMERA 2 -#define SONY_PIC_COMMAND_GETCAMERABRIGHTNESS 3 /* obsolete */ -#define SONY_PIC_COMMAND_SETCAMERABRIGHTNESS 4 -#define SONY_PIC_COMMAND_GETCAMERACONTRAST 5 /* obsolete */ -#define SONY_PIC_COMMAND_SETCAMERACONTRAST 6 -#define SONY_PIC_COMMAND_GETCAMERAHUE 7 /* obsolete */ -#define SONY_PIC_COMMAND_SETCAMERAHUE 8 -#define SONY_PIC_COMMAND_GETCAMERACOLOR 9 /* obsolete */ -#define SONY_PIC_COMMAND_SETCAMERACOLOR 10 -#define SONY_PIC_COMMAND_GETCAMERASHARPNESS 11 /* obsolete */ -#define SONY_PIC_COMMAND_SETCAMERASHARPNESS 12 -#define SONY_PIC_COMMAND_GETCAMERAPICTURE 13 /* obsolete */ -#define SONY_PIC_COMMAND_SETCAMERAPICTURE 14 -#define SONY_PIC_COMMAND_GETCAMERAAGC 15 /* obsolete */ -#define SONY_PIC_COMMAND_SETCAMERAAGC 16 -#define SONY_PIC_COMMAND_GETCAMERADIRECTION 17 /* obsolete */ -#define SONY_PIC_COMMAND_GETCAMERAROMVERSION 18 /* obsolete */ -#define SONY_PIC_COMMAND_GETCAMERAREVISION 19 /* obsolete */ - -#if IS_ENABLED(CONFIG_SONY_LAPTOP) -int sony_pic_camera_command(int command, u8 value); -#else -static inline int sony_pic_camera_command(int command, u8 value) { return 0; } -#endif - -#endif /* __KERNEL__ */ - -#endif /* _SONYLAPTOP_H_ */ -- cgit v1.2.3 From 88cefd99ee265b18f851b911a75282146ecabc3d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Apr 2025 15:38:30 -0400 Subject: ftrace: Show subops in enabled_functions The function graph infrastructure uses subops of the function tracer. These are not shown in enabled_functions. Add a "subops:" section to the enabled_functions line to show what functions are attached via subops. If the subops is from the function_graph infrastructure, then show the entry and return callbacks that are attached. Here's an example of the output: schedule_on_each_cpu (1) tramp: 0xffffffffc03ef000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 subops: {ent:trace_graph_entry+0x0/0x20 ret:trace_graph_return+0x0/0x150} Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250410153830.5d97f108@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fbabc3d848b3..fc939ca2ff66 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -328,6 +328,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * DIRECT - Used by the direct ftrace_ops helper for direct functions * (internal ftrace only, should not be used by others) * SUBOP - Is controlled by another op in field managed. + * GRAPH - Is a component of the fgraph_ops structure */ enum { FTRACE_OPS_FL_ENABLED = BIT(0), @@ -349,6 +350,7 @@ enum { FTRACE_OPS_FL_PERMANENT = BIT(16), FTRACE_OPS_FL_DIRECT = BIT(17), FTRACE_OPS_FL_SUBOP = BIT(18), + FTRACE_OPS_FL_GRAPH = BIT(19), }; #ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS -- cgit v1.2.3 From 53eddae9af0c0b46f9c77a02d23c21c1aa824739 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 7 May 2025 20:47:32 +0200 Subject: platform/x86: int3472: Move common.h to public includes, symbols to INTEL_INT3472 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the common.h header file to include/linux/platform_data/x86/int3472.h and add a "INTEL_INT3472" kernel-symbol-namespace to the exported symbols. This is a preparation patch for exporting some more symbols for re-use in the atomisp driver. Signed-off-by: Hans de Goede Reviewed-by: Sakari Ailus Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250507184737.154747-2-hdegoede@redhat.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/int3472.h | 164 ++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 include/linux/platform_data/x86/int3472.h (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/int3472.h b/include/linux/platform_data/x86/int3472.h new file mode 100644 index 000000000000..4cf02df6f753 --- /dev/null +++ b/include/linux/platform_data/x86/int3472.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Intel INT3472 ACPI camera sensor power-management support + * + * Author: Dan Scally + */ + +#ifndef __PLATFORM_DATA_X86_INT3472_H +#define __PLATFORM_DATA_X86_INT3472_H + +#include +#include +#include +#include +#include +#include + +/* FIXME drop this once the I2C_DEV_NAME_FORMAT macro has been added to include/linux/i2c.h */ +#ifndef I2C_DEV_NAME_FORMAT +#define I2C_DEV_NAME_FORMAT "i2c-%s" +#endif + +/* PMIC GPIO Types */ +#define INT3472_GPIO_TYPE_RESET 0x00 +#define INT3472_GPIO_TYPE_POWERDOWN 0x01 +#define INT3472_GPIO_TYPE_POWER_ENABLE 0x0b +#define INT3472_GPIO_TYPE_CLK_ENABLE 0x0c +#define INT3472_GPIO_TYPE_PRIVACY_LED 0x0d +#define INT3472_GPIO_TYPE_HANDSHAKE 0x12 + +#define INT3472_PDEV_MAX_NAME_LEN 23 +#define INT3472_MAX_SENSOR_GPIOS 3 +#define INT3472_MAX_REGULATORS 3 + +/* E.g. "avdd\0" */ +#define GPIO_SUPPLY_NAME_LENGTH 5 +/* 12 chars for acpi_dev_name() + "-", e.g. "ABCD1234:00-" */ +#define GPIO_REGULATOR_NAME_LENGTH (12 + GPIO_SUPPLY_NAME_LENGTH) +/* lower- and upper-case mapping */ +#define GPIO_REGULATOR_SUPPLY_MAP_COUNT 2 +/* + * Ensure the GPIO is driven low/high for at least 2 ms before changing. + * + * 2 ms has been chosen because it is the minimum time ovXXXX sensors need to + * have their reset line driven logical high to properly register a reset. + */ +#define GPIO_REGULATOR_ENABLE_TIME (2 * USEC_PER_MSEC) +#define GPIO_REGULATOR_OFF_ON_DELAY (2 * USEC_PER_MSEC) + +#define INT3472_LED_MAX_NAME_LEN 32 + +#define CIO2_SENSOR_SSDB_MCLKSPEED_OFFSET 86 + +#define INT3472_REGULATOR(_name, _ops, _enable_time, _off_on_delay) \ + (const struct regulator_desc) { \ + .name = _name, \ + .type = REGULATOR_VOLTAGE, \ + .ops = _ops, \ + .owner = THIS_MODULE, \ + .enable_time = _enable_time, \ + .off_on_delay = _off_on_delay, \ + } + +#define to_int3472_clk(hw) \ + container_of(hw, struct int3472_clock, clk_hw) + +#define to_int3472_device(clk) \ + container_of(clk, struct int3472_discrete_device, clock) + +struct acpi_device; +struct dmi_system_id; +struct i2c_client; +struct platform_device; + +struct int3472_cldb { + u8 version; + /* + * control logic type + * 0: UNKNOWN + * 1: DISCRETE(CRD-D) + * 2: PMIC TPS68470 + * 3: PMIC uP6641 + */ + u8 control_logic_type; + u8 control_logic_id; + u8 sensor_card_sku; + u8 reserved[10]; + u8 clock_source; + u8 reserved2[17]; +}; + +struct int3472_discrete_quirks { + /* For models where AVDD GPIO is shared between sensors */ + const char *avdd_second_sensor; +}; + +struct int3472_gpio_regulator { + /* SUPPLY_MAP_COUNT * 2 to make room for second sensor mappings */ + struct regulator_consumer_supply supply_map[GPIO_REGULATOR_SUPPLY_MAP_COUNT * 2]; + char supply_name_upper[GPIO_SUPPLY_NAME_LENGTH]; + char regulator_name[GPIO_REGULATOR_NAME_LENGTH]; + struct regulator_dev *rdev; + struct regulator_desc rdesc; +}; + +struct int3472_discrete_device { + struct acpi_device *adev; + struct device *dev; + struct acpi_device *sensor; + const char *sensor_name; + + const struct int3472_sensor_config *sensor_config; + + struct int3472_gpio_regulator regulators[INT3472_MAX_REGULATORS]; + + struct int3472_clock { + struct clk *clk; + struct clk_hw clk_hw; + struct clk_lookup *cl; + struct gpio_desc *ena_gpio; + u32 frequency; + u8 imgclk_index; + } clock; + + struct int3472_pled { + struct led_classdev classdev; + struct led_lookup_data lookup; + char name[INT3472_LED_MAX_NAME_LEN]; + struct gpio_desc *gpio; + } pled; + + struct int3472_discrete_quirks quirks; + + unsigned int ngpios; /* how many GPIOs have we seen */ + unsigned int n_sensor_gpios; /* how many have we mapped to sensor */ + unsigned int n_regulator_gpios; /* how many have we mapped to a regulator */ + struct gpiod_lookup_table gpios; +}; + +extern const struct dmi_system_id skl_int3472_discrete_quirks[]; + +union acpi_object *skl_int3472_get_acpi_buffer(struct acpi_device *adev, + char *id); +int skl_int3472_fill_cldb(struct acpi_device *adev, struct int3472_cldb *cldb); +int skl_int3472_get_sensor_adev_and_name(struct device *dev, + struct acpi_device **sensor_adev_ret, + const char **name_ret); + +int skl_int3472_register_gpio_clock(struct int3472_discrete_device *int3472, + struct gpio_desc *gpio); +int skl_int3472_register_dsm_clock(struct int3472_discrete_device *int3472); +void skl_int3472_unregister_clock(struct int3472_discrete_device *int3472); + +int skl_int3472_register_regulator(struct int3472_discrete_device *int3472, + struct gpio_desc *gpio, + unsigned int enable_time, + const char *supply_name, + const char *second_sensor); +void skl_int3472_unregister_regulator(struct int3472_discrete_device *int3472); + +int skl_int3472_register_pled(struct int3472_discrete_device *int3472, struct gpio_desc *gpio); +void skl_int3472_unregister_pled(struct int3472_discrete_device *int3472); + +#endif -- cgit v1.2.3 From 1e5d088a52c207bcef6a43a6f6ffe162c514ed64 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 7 May 2025 20:47:33 +0200 Subject: platform/x86: int3472: Stop using devm_gpiod_get() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The intent is to re-use the INT3472 code for parsing Intel camera sensor GPIOs and mapping them to the sensor for the atomisp driver, which currently has duplicate code. On atomisp devices there is no special INT3472 ACPI device, instead the Intel _DSM to get the GPIO type is part of the ACPI device for the sensor itself. To deal with this the mapping is done from ipu_bridge_init() instead of from a platform-device probe() function, there is no device to tie the lifetime of the gpiod_get() calls done by the INT3472 code to. Switch from devm_gpiod_get() to plain gpiod_get() + explicit gpiod_put() calls, to prepare for the code being re-used in the atomisp driver. Signed-off-by: Hans de Goede Reviewed-by: Sakari Ailus Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250507184737.154747-3-hdegoede@redhat.com Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/int3472.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/int3472.h b/include/linux/platform_data/x86/int3472.h index 4cf02df6f753..0a835cc85c67 100644 --- a/include/linux/platform_data/x86/int3472.h +++ b/include/linux/platform_data/x86/int3472.h @@ -99,6 +99,7 @@ struct int3472_gpio_regulator { struct regulator_consumer_supply supply_map[GPIO_REGULATOR_SUPPLY_MAP_COUNT * 2]; char supply_name_upper[GPIO_SUPPLY_NAME_LENGTH]; char regulator_name[GPIO_REGULATOR_NAME_LENGTH]; + struct gpio_desc *ena_gpio; struct regulator_dev *rdev; struct regulator_desc rdesc; }; -- cgit v1.2.3 From 1cfa1bb9b4033e51d3c3f5ed5bf55475e57cc686 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 7 May 2025 20:47:34 +0200 Subject: platform/x86: int3472: Export int3472_discrete_parse_crs() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the moment the atomisp has duplicate code for parsing Intel camera sensor GPIOS and calling the special 79234640-9e10-4fea-a5c1-b5aa8b19756f _DSM to get the GPIO type and map it to the sensor. Export int3472_discrete_parse_crs() so that the atomisp driver can reuse the INT3472 code for this. Signed-off-by: Hans de Goede Reviewed-by: Sakari Ailus Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250507184737.154747-4-hdegoede@redhat.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/int3472.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/int3472.h b/include/linux/platform_data/x86/int3472.h index 0a835cc85c67..89410f0cb73a 100644 --- a/include/linux/platform_data/x86/int3472.h +++ b/include/linux/platform_data/x86/int3472.h @@ -147,6 +147,9 @@ int skl_int3472_get_sensor_adev_and_name(struct device *dev, struct acpi_device **sensor_adev_ret, const char **name_ret); +int int3472_discrete_parse_crs(struct int3472_discrete_device *int3472); +void int3472_discrete_cleanup(struct int3472_discrete_device *int3472); + int skl_int3472_register_gpio_clock(struct int3472_discrete_device *int3472, struct gpio_desc *gpio); int skl_int3472_register_dsm_clock(struct int3472_discrete_device *int3472); -- cgit v1.2.3 From 45adb05473aa4afd6ff19fd0c46c17a6294ae788 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 7 May 2025 20:47:35 +0200 Subject: platform/x86: int3472: Remove unused sensor_config struct member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sensor_config is not used anywhere and its struct int3472_sensor_config type also is not declared anywhere, drop it. Signed-off-by: Hans de Goede Reviewed-by: Sakari Ailus Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250507184737.154747-5-hdegoede@redhat.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/int3472.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/int3472.h b/include/linux/platform_data/x86/int3472.h index 89410f0cb73a..78276a11c48d 100644 --- a/include/linux/platform_data/x86/int3472.h +++ b/include/linux/platform_data/x86/int3472.h @@ -110,8 +110,6 @@ struct int3472_discrete_device { struct acpi_device *sensor; const char *sensor_name; - const struct int3472_sensor_config *sensor_config; - struct int3472_gpio_regulator regulators[INT3472_MAX_REGULATORS]; struct int3472_clock { -- cgit v1.2.3 From 190faecf64c54bb5f9d8f0ea2a6628078d0f2c83 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 22 Apr 2025 15:05:18 -0600 Subject: overflow: Add STACK_FLEX_ARRAY_SIZE() helper Add new STACK_FLEX_ARRAY_SIZE() helper to get the size of a flexible-array member defined using DEFINE_FLEX()/DEFINE_RAW_FLEX() at compile time. This is essentially the same as ARRAY_SIZE() but for on-stack flexible-array members. Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/83d53744e11c80eb3f03765238cbe648855f4168.1745355442.git.gustavoars@kernel.org Signed-off-by: Kees Cook --- include/linux/overflow.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 6ee67c20b575..f33d74dac06f 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -420,6 +420,8 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) * flexible array member. * Use __struct_size(@name) to get compile-time size of it afterwards. * Use __member_size(@name->member) to get compile-time size of @name members. + * Use STACK_FLEX_ARRAY_SIZE(@name, @member) to get compile-time number of + * elements in array @member. */ #define DEFINE_RAW_FLEX(type, name, member, count) \ _DEFINE_FLEX(type, name, member, count, = {}) @@ -438,8 +440,21 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) * flexible array member. * Use __struct_size(@NAME) to get compile-time size of it afterwards. * Use __member_size(@NAME->member) to get compile-time size of @NAME members. + * Use STACK_FLEX_ARRAY_SIZE(@name, @member) to get compile-time number of + * elements in array @member. */ #define DEFINE_FLEX(TYPE, NAME, MEMBER, COUNTER, COUNT) \ _DEFINE_FLEX(TYPE, NAME, MEMBER, COUNT, = { .obj.COUNTER = COUNT, }) +/** + * STACK_FLEX_ARRAY_SIZE() - helper macro for DEFINE_FLEX() family. + * Returns the number of elements in @array. + * + * @name: Name for a variable defined in DEFINE_RAW_FLEX()/DEFINE_FLEX(). + * @array: Name of the array member. + */ +#define STACK_FLEX_ARRAY_SIZE(name, array) \ + (__member_size((name)->array) / sizeof(*(name)->array) + \ + __must_be_array((name)->array)) + #endif /* __LINUX_OVERFLOW_H */ -- cgit v1.2.3 From 47e36ed7840661a9f7fb53554a1b04a5f8daffea Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 1 May 2025 18:44:43 -0600 Subject: overflow: Fix direct struct member initialization in _DEFINE_FLEX() Currently, to statically initialize the struct members of the `type` object created by _DEFINE_FLEX(), the internal `obj` member must be explicitly referenced at the call site. See: struct flex { int a; int b; struct foo flex_array[]; }; _DEFINE_FLEX(struct flex, instance, flex_array, FIXED_SIZE, = { .obj = { .a = 0, .b = 1, }, }); This leaks _DEFINE_FLEX() internal implementation details and make the helper harder to use and read. Fix this and allow for a more natural and intuitive C99 init-style: _DEFINE_FLEX(struct flex, instance, flex_array, FIXED_SIZE, = { .a = 0, .b = 1, }); Note that before these changes, the `initializer` argument was optional, but now it's required. Also, update "counter" member initialization in DEFINE_FLEX(). Fixes: 26dd68d293fd ("overflow: add DEFINE_FLEX() for on-stack allocs") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/aBQVeyKfLOkO9Yss@kspp Signed-off-by: Kees Cook --- include/linux/overflow.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/overflow.h b/include/linux/overflow.h index f33d74dac06f..7b7be27ca113 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -396,7 +396,7 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) * @name: Name for a variable to define. * @member: Name of the array member. * @count: Number of elements in the array; must be compile-time const. - * @initializer: initializer expression (could be empty for no init). + * @initializer: Initializer expression (e.g., pass `= { }` at minimum). */ #define _DEFINE_FLEX(type, name, member, count, initializer...) \ _Static_assert(__builtin_constant_p(count), \ @@ -404,7 +404,7 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) union { \ u8 bytes[struct_size_t(type, member, count)]; \ type obj; \ - } name##_u initializer; \ + } name##_u = { .obj initializer }; \ type *name = (type *)&name##_u /** @@ -444,7 +444,7 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) * elements in array @member. */ #define DEFINE_FLEX(TYPE, NAME, MEMBER, COUNTER, COUNT) \ - _DEFINE_FLEX(TYPE, NAME, MEMBER, COUNT, = { .obj.COUNTER = COUNT, }) + _DEFINE_FLEX(TYPE, NAME, MEMBER, COUNT, = { .COUNTER = COUNT, }) /** * STACK_FLEX_ARRAY_SIZE() - helper macro for DEFINE_FLEX() family. -- cgit v1.2.3 From 0cecd37daef3d57e6656c0023978d5ec2d7409c1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 3 May 2025 11:46:18 -0700 Subject: gcc-plugins: Force full rebuild when plugins change There was no dependency between the plugins changing and the rest of the kernel being built. This could cause strange behaviors as instrumentation could vary between targets depending on when they were built. Generate a new header file, gcc-plugins.h, any time the GCC plugins change. Include the header file in compiler-version.h when its associated feature name, GCC_PLUGINS, is defined. This will be picked up by fixdep and force rebuilds where needed. Add a generic "touch" kbuild command, which will be used again in a following patch. Add a "normalize_path" string helper to make the "TOUCH" output less ugly. Link: https://lore.kernel.org/r/20250503184623.2572355-1-kees@kernel.org Tested-by: Nicolas Schier Reviewed-by: Nicolas Schier Signed-off-by: Kees Cook --- include/linux/compiler-version.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-version.h b/include/linux/compiler-version.h index 573fa85b6c0c..5dba398a9412 100644 --- a/include/linux/compiler-version.h +++ b/include/linux/compiler-version.h @@ -12,3 +12,14 @@ * and add dependency on include/config/CC_VERSION_TEXT, which is touched * by Kconfig when the version string from the compiler changes. */ + +/* Additional tree-wide dependencies start here. */ + +/* + * If any of the GCC plugins change, we need to rebuild everything that + * was built with them, as they may have changed their behavior and those + * behaviors may need to be synchronized across all translation units. + */ +#ifdef GCC_PLUGINS +#include +#endif -- cgit v1.2.3 From 056000c471ea41db56de58f71b9fe727d6e68b9b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 3 May 2025 11:46:19 -0700 Subject: randstruct: Force full rebuild when seed changes While the randstruct GCC plugin was being rebuilt if the randstruct seed changed, Clang builds did not notice the change. This could result in differing struct layouts in a target depending on when it was built. Include the existing generated header file in compiler-version.h when its associated feature name, RANDSTRUCT, is defined. This will be picked up by fixdep and force rebuilds where needed. Link: https://lore.kernel.org/r/20250503184623.2572355-2-kees@kernel.org Reviewed-by: Nicolas Schier Tested-by: Nicolas Schier Signed-off-by: Kees Cook --- include/linux/compiler-version.h | 9 +++++++++ include/linux/vermagic.h | 1 - 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler-version.h b/include/linux/compiler-version.h index 5dba398a9412..4b9d39b37650 100644 --- a/include/linux/compiler-version.h +++ b/include/linux/compiler-version.h @@ -23,3 +23,12 @@ #ifdef GCC_PLUGINS #include #endif + +/* + * If the randstruct seed itself changes (whether for GCC plugins or + * Clang), the entire tree needs to be rebuilt since the randomization of + * structures may change between compilation units if not. + */ +#ifdef RANDSTRUCT +#include +#endif diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index 939ceabcaf06..335c360d4f9b 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -33,7 +33,6 @@ #define MODULE_VERMAGIC_MODVERSIONS "" #endif #ifdef RANDSTRUCT -#include #define MODULE_RANDSTRUCT "RANDSTRUCT_" RANDSTRUCT_HASHED_SEED #else #define MODULE_RANDSTRUCT -- cgit v1.2.3 From 11bb1678e249e51cd748e8f91e5241b3ce71da3a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 3 May 2025 11:46:20 -0700 Subject: integer-wrap: Force full rebuild when .scl file changes Since the integer wrapping sanitizer's behavior depends on its associated .scl file, we must force a full rebuild if the file changes. If not, instrumentation may differ between targets based on when they were built. Generate a new header file, integer-wrap.h, any time the Clang .scl file changes. Include the header file in compiler-version.h when its associated feature name, INTEGER_WRAP, is defined. This will be picked up by fixdep and force rebuilds where needed. Acked-by: Justin Stitt Link: https://lore.kernel.org/r/20250503184623.2572355-3-kees@kernel.org Reviewed-by: Nicolas Schier Signed-off-by: Kees Cook --- include/linux/compiler-version.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-version.h b/include/linux/compiler-version.h index 4b9d39b37650..ac1665a98a15 100644 --- a/include/linux/compiler-version.h +++ b/include/linux/compiler-version.h @@ -32,3 +32,13 @@ #ifdef RANDSTRUCT #include #endif + +/* + * If any external changes affect Clang's integer wrapping sanitizer + * behavior, a full rebuild is needed as the coverage for wrapping types + * may have changed, which may impact the expected behaviors that should + * not differ between compilation units. + */ +#ifdef INTEGER_WRAP +#include +#endif -- cgit v1.2.3 From e86e43907f945e7f2e48d1c5c9105801ca2b11b7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 7 May 2025 19:53:30 +0200 Subject: timers: Rename init_timer_key() as timer_init_key() Move this API to the canonical timer_*() namespace. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250507175338.672442-3-mingo@kernel.org --- include/linux/timer.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 10596d7c3a34..0c1c3aa723e5 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -67,7 +67,7 @@ /* * LOCKDEP and DEBUG timer interfaces. */ -void init_timer_key(struct timer_list *timer, +void timer_init_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); @@ -83,7 +83,7 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, const char *name, struct lock_class_key *key) { - init_timer_key(timer, func, flags, name, key); + timer_init_key(timer, func, flags, name, key); } #endif @@ -91,7 +91,7 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define __init_timer(_timer, _fn, _flags) \ do { \ static struct lock_class_key __key; \ - init_timer_key((_timer), (_fn), (_flags), #_timer, &__key);\ + timer_init_key((_timer), (_fn), (_flags), #_timer, &__key);\ } while (0) #define __init_timer_on_stack(_timer, _fn, _flags) \ @@ -102,7 +102,7 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, } while (0) #else #define __init_timer(_timer, _fn, _flags) \ - init_timer_key((_timer), (_fn), (_flags), NULL, NULL) + timer_init_key((_timer), (_fn), (_flags), NULL, NULL) #define __init_timer_on_stack(_timer, _fn, _flags) \ init_timer_on_stack_key((_timer), (_fn), (_flags), NULL, NULL) #endif -- cgit v1.2.3 From 7879d10de3318af0b7893d1efc9d7654cd4ac1a6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 7 May 2025 19:53:31 +0200 Subject: timers: Rename init_timer_on_stack_key() as timer_init_key_on_stack() Move this API to the canonical timer_*() namespace. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250507175338.672442-4-mingo@kernel.org --- include/linux/timer.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 0c1c3aa723e5..31127e8de010 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -72,12 +72,12 @@ void timer_init_key(struct timer_list *timer, const char *name, struct lock_class_key *key); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS -extern void init_timer_on_stack_key(struct timer_list *timer, +extern void timer_init_key_on_stack(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); #else -static inline void init_timer_on_stack_key(struct timer_list *timer, +static inline void timer_init_key_on_stack(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, @@ -97,14 +97,14 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, #define __init_timer_on_stack(_timer, _fn, _flags) \ do { \ static struct lock_class_key __key; \ - init_timer_on_stack_key((_timer), (_fn), (_flags), \ + timer_init_key_on_stack((_timer), (_fn), (_flags), \ #_timer, &__key); \ } while (0) #else #define __init_timer(_timer, _fn, _flags) \ timer_init_key((_timer), (_fn), (_flags), NULL, NULL) #define __init_timer_on_stack(_timer, _fn, _flags) \ - init_timer_on_stack_key((_timer), (_fn), (_flags), NULL, NULL) + timer_init_key_on_stack((_timer), (_fn), (_flags), NULL, NULL) #endif /** -- cgit v1.2.3 From 9505215b6b3233d38c5ee65cfd47b6a5a36adab9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 7 May 2025 19:53:32 +0200 Subject: timers: Rename __init_timer() as __timer_init() Move this API to the canonical __timer_*() namespace. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250507175338.672442-5-mingo@kernel.org --- include/linux/timer.h | 6 +++--- include/linux/workqueue.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 31127e8de010..11e1fac18f0e 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -88,7 +88,7 @@ static inline void timer_init_key_on_stack(struct timer_list *timer, #endif #ifdef CONFIG_LOCKDEP -#define __init_timer(_timer, _fn, _flags) \ +#define __timer_init(_timer, _fn, _flags) \ do { \ static struct lock_class_key __key; \ timer_init_key((_timer), (_fn), (_flags), #_timer, &__key);\ @@ -101,7 +101,7 @@ static inline void timer_init_key_on_stack(struct timer_list *timer, #_timer, &__key); \ } while (0) #else -#define __init_timer(_timer, _fn, _flags) \ +#define __timer_init(_timer, _fn, _flags) \ timer_init_key((_timer), (_fn), (_flags), NULL, NULL) #define __init_timer_on_stack(_timer, _fn, _flags) \ timer_init_key_on_stack((_timer), (_fn), (_flags), NULL, NULL) @@ -118,7 +118,7 @@ static inline void timer_init_key_on_stack(struct timer_list *timer, * be used and must be balanced with a call to destroy_timer_on_stack(). */ #define timer_setup(timer, callback, flags) \ - __init_timer((timer), (callback), (flags)) + __timer_init((timer), (callback), (flags)) #define timer_setup_on_stack(timer, callback, flags) \ __init_timer_on_stack((timer), (callback), (flags)) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index b0dc957c3e56..985f69f8fb99 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -316,7 +316,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } #define __INIT_DELAYED_WORK(_work, _func, _tflags) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ - __init_timer(&(_work)->timer, \ + __timer_init(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) -- cgit v1.2.3 From 9a716ac6eaaa73599921fa081c99b67bef589171 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 7 May 2025 19:53:33 +0200 Subject: timers: Rename __init_timer_on_stack() as __timer_init_on_stack() Move this API to the canonical __timer_*() namespace. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250507175338.672442-6-mingo@kernel.org --- include/linux/timer.h | 6 +++--- include/linux/workqueue.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 11e1fac18f0e..4e1237ff4b24 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -94,7 +94,7 @@ static inline void timer_init_key_on_stack(struct timer_list *timer, timer_init_key((_timer), (_fn), (_flags), #_timer, &__key);\ } while (0) -#define __init_timer_on_stack(_timer, _fn, _flags) \ +#define __timer_init_on_stack(_timer, _fn, _flags) \ do { \ static struct lock_class_key __key; \ timer_init_key_on_stack((_timer), (_fn), (_flags), \ @@ -103,7 +103,7 @@ static inline void timer_init_key_on_stack(struct timer_list *timer, #else #define __timer_init(_timer, _fn, _flags) \ timer_init_key((_timer), (_fn), (_flags), NULL, NULL) -#define __init_timer_on_stack(_timer, _fn, _flags) \ +#define __timer_init_on_stack(_timer, _fn, _flags) \ timer_init_key_on_stack((_timer), (_fn), (_flags), NULL, NULL) #endif @@ -121,7 +121,7 @@ static inline void timer_init_key_on_stack(struct timer_list *timer, __timer_init((timer), (callback), (flags)) #define timer_setup_on_stack(timer, callback, flags) \ - __init_timer_on_stack((timer), (callback), (flags)) + __timer_init_on_stack((timer), (callback), (flags)) #ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void destroy_timer_on_stack(struct timer_list *timer); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 985f69f8fb99..c84da78878d9 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -324,7 +324,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } #define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags) \ do { \ INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ - __init_timer_on_stack(&(_work)->timer, \ + __timer_init_on_stack(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) -- cgit v1.2.3 From 220beffd36c26ac68e1c646f91b7ddf4f39039e6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 7 May 2025 19:53:34 +0200 Subject: timers: Rename NEXT_TIMER_MAX_DELTA as TIMER_NEXT_MAX_DELTA Move this macro to the canonical TIMER_* namespace. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250507175338.672442-7-mingo@kernel.org --- include/linux/timer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 4e1237ff4b24..68cf8e28bbe0 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -156,7 +156,7 @@ extern int timer_reduce(struct timer_list *timer, unsigned long expires); * The jiffies value which is added to now, when there is no timer * in the timer wheel: */ -#define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) +#define TIMER_NEXT_MAX_DELTA ((1UL << 30) - 1) extern void add_timer(struct timer_list *timer); extern void add_timer_local(struct timer_list *timer); -- cgit v1.2.3 From 751e6a394c2ecd08825d5ba527478e171b87144e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 7 May 2025 19:53:35 +0200 Subject: timers: Rename init_timers() as timers_init() Move this API to the canonical timers_*() namespace. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250507175338.672442-8-mingo@kernel.org --- include/linux/timer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 68cf8e28bbe0..153d07dad3cd 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -168,7 +168,7 @@ extern int timer_delete(struct timer_list *timer); extern int timer_shutdown_sync(struct timer_list *timer); extern int timer_shutdown(struct timer_list *timer); -extern void init_timers(void); +extern void timers_init(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); -- cgit v1.2.3 From 367ed4e35734d6e7bce1dbca426a5bf150d76905 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 7 May 2025 19:53:36 +0200 Subject: treewide, timers: Rename try_to_del_timer_sync() as timer_delete_sync_try() Move this API to the canonical timer_*() namespace. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250507175338.672442-9-mingo@kernel.org --- include/linux/timer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 153d07dad3cd..5b6ff90fcf81 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -162,7 +162,7 @@ extern void add_timer(struct timer_list *timer); extern void add_timer_local(struct timer_list *timer); extern void add_timer_global(struct timer_list *timer); -extern int try_to_del_timer_sync(struct timer_list *timer); +extern int timer_delete_sync_try(struct timer_list *timer); extern int timer_delete_sync(struct timer_list *timer); extern int timer_delete(struct timer_list *timer); extern int timer_shutdown_sync(struct timer_list *timer); -- cgit v1.2.3 From aad823aa3a7d675a8d0de478a04307f63e3725db Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 7 May 2025 19:53:37 +0200 Subject: treewide, timers: Rename destroy_timer_on_stack() as timer_destroy_on_stack() Move this API to the canonical timer_*() namespace. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250507175338.672442-10-mingo@kernel.org --- include/linux/timer.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 5b6ff90fcf81..7b53043a2d25 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -115,7 +115,7 @@ static inline void timer_init_key_on_stack(struct timer_list *timer, * * Regular timer initialization should use either DEFINE_TIMER() above, * or timer_setup(). For timers on the stack, timer_setup_on_stack() must - * be used and must be balanced with a call to destroy_timer_on_stack(). + * be used and must be balanced with a call to timer_destroy_on_stack(). */ #define timer_setup(timer, callback, flags) \ __timer_init((timer), (callback), (flags)) @@ -124,9 +124,9 @@ static inline void timer_init_key_on_stack(struct timer_list *timer, __timer_init_on_stack((timer), (callback), (flags)) #ifdef CONFIG_DEBUG_OBJECTS_TIMERS -extern void destroy_timer_on_stack(struct timer_list *timer); +extern void timer_destroy_on_stack(struct timer_list *timer); #else -static inline void destroy_timer_on_stack(struct timer_list *timer) { } +static inline void timer_destroy_on_stack(struct timer_list *timer) { } #endif #define from_timer(var, callback_timer, timer_fieldname) \ -- cgit v1.2.3 From 01475aedfdfa33a5ee3219079426f5743367c624 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Mon, 28 Apr 2025 21:34:45 +0200 Subject: futex: Fix outdated comment in struct restart_block Since commit 2070887fdeac ("futex: fix restart in wait_requeue_pi"), futex_wait_requeue_pi() no longer uses restart_block. Update the comment accordingly. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250428193445.4571-1-namcao@linutronix.de --- include/linux/restart_block.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 13f17676c5f4..7e50bbc94e47 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -26,7 +26,7 @@ struct restart_block { unsigned long arch_data; long (*fn)(struct restart_block *); union { - /* For futex_wait and futex_wait_requeue_pi */ + /* For futex_wait() */ struct { u32 __user *uaddr; u32 val; -- cgit v1.2.3 From f400565faa50737ac1d550d2c75128c0dad75765 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Apr 2025 18:11:27 +0200 Subject: perf: Remove too early and redundant CPU hotplug handling The CPU hotplug handlers are called twice: at prepare and online stage. Their role is to: 1) Enable/disable a CPU context. This is irrelevant and even buggy at the prepare stage because the CPU is still offline. On early secondary CPU up, creating an event attached to that CPU might silently fail because the CPU context is observed as online but the context installation's IPI failure is ignored. 2) Update the scope cpumasks and re-migrate the events accordingly in the CPU down case. This is irrelevant at the prepare stage. 3) Remove the events attached to the context of the offlining CPU. It even uses an (unnecessary) IPI for it. This is also irrelevant at the prepare stage. Also none of the *_PREPARE and *_STARTING architecture perf related CPU hotplug callbacks rely on CPUHP_PERF_PREPARE. CPUHP_AP_PERF_ONLINE is enough and the right place to perform the work. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424161128.29176-4-frederic@kernel.org --- include/linux/cpuhotplug.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 1987400000b4..df366ee15456 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -60,7 +60,6 @@ enum cpuhp_state { /* PREPARE section invoked on a control CPU */ CPUHP_OFFLINE = 0, CPUHP_CREATE_THREADS, - CPUHP_PERF_PREPARE, CPUHP_PERF_X86_PREPARE, CPUHP_PERF_X86_AMD_UNCORE_PREP, CPUHP_PERF_POWER, -- cgit v1.2.3 From 22c64f37e1d4e757b0073a72f1439c2c3509c5cb Mon Sep 17 00:00:00 2001 From: Mohan Kumar G Date: Mon, 5 May 2025 20:58:36 +0530 Subject: wifi: mac80211: Update MCS15 support in link_conf As per IEEE 802.11be-2024 - 9.4.2.321, EHT operation element contains MCS15 Disable subfield as the sixth bit, which is set when MCS15 support is not enabled. Get MCS15 support from EHT operation params and add it in link_conf so that driver can use this value to know if EHT-MCS 15 reception is enabled. Co-developed-by: Dhanavandhana Kannan Signed-off-by: Dhanavandhana Kannan Signed-off-by: Mohan Kumar G Link: https://patch.msgid.link/20250505152836.3266829-1-quic_mkumarg@quicinc.com [remove pointless !! for bool assignment] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 17f917cb4540..420c7f9aa6ee 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2325,6 +2325,7 @@ struct ieee80211_eht_cap_elem { #define IEEE80211_EHT_OPER_EHT_DEF_PE_DURATION 0x04 #define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_LIMIT 0x08 #define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_EXP_MASK 0x30 +#define IEEE80211_EHT_OPER_MCS15_DISABLE 0x40 /** * struct ieee80211_eht_operation - eht operation element -- cgit v1.2.3 From 56a7b9f8b05981e929becb45a826558779bca6f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 13 Mar 2025 15:31:50 -0700 Subject: ratelimit: Create functions to handle ratelimit_state internals A number of ratelimit use cases do open-coded access to the ratelimit_state structure's ->missed field. This works, but is a bit messy and makes it more annoying to make changes to this field. Therefore, provide a ratelimit_state_inc_miss() function that increments the ->missed field, a ratelimit_state_get_miss() function that reads out the ->missed field, and a ratelimit_state_reset_miss() function that reads out that field, but that also resets its value to zero. These functions will replace client-code open-coded uses of ->missed. In addition, a new ratelimit_state_reset_interval() function encapsulates what was previously open-coded lock acquisition and direct field updates. [ paulmck: Apply kernel test robot feedback. ] Link: https://lore.kernel.org/all/fbe93a52-365e-47fe-93a4-44a44547d601@paulmck-laptop/ Link: https://lore.kernel.org/all/20250423115409.3425-1-spasswolf@web.de/ Signed-off-by: Paul E. McKenney Reviewed-by: Petr Mladek Cc: Andrew Morton Cc: Kuniyuki Iwashima Cc: Mateusz Guzik Cc: Steven Rostedt Cc: John Ogness Cc: Sergey Senozhatsky --- include/linux/ratelimit.h | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h index b17e0cd0a30c..8400c5356c18 100644 --- a/include/linux/ratelimit.h +++ b/include/linux/ratelimit.h @@ -22,16 +22,46 @@ static inline void ratelimit_default_init(struct ratelimit_state *rs) DEFAULT_RATELIMIT_BURST); } +static inline void ratelimit_state_inc_miss(struct ratelimit_state *rs) +{ + rs->missed++; +} + +static inline int ratelimit_state_get_miss(struct ratelimit_state *rs) +{ + return rs->missed; +} + +static inline int ratelimit_state_reset_miss(struct ratelimit_state *rs) +{ + int ret = rs->missed; + + rs->missed = 0; + return ret; +} + +static inline void ratelimit_state_reset_interval(struct ratelimit_state *rs, int interval_init) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rs->lock, flags); + rs->interval = interval_init; + rs->begin = 0; + rs->printed = 0; + ratelimit_state_reset_miss(rs); + raw_spin_unlock_irqrestore(&rs->lock, flags); +} + static inline void ratelimit_state_exit(struct ratelimit_state *rs) { + int m; + if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) return; - if (rs->missed) { - pr_warn("%s: %d output lines suppressed due to ratelimiting\n", - current->comm, rs->missed); - rs->missed = 0; - } + m = ratelimit_state_reset_miss(rs); + if (m) + pr_warn("%s: %d output lines suppressed due to ratelimiting\n", current->comm, m); } static inline void -- cgit v1.2.3 From 78bf44de47b3cec72ee8ddc14161d5b3212f25e0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Mar 2025 09:07:13 -0700 Subject: ratelimit: Convert the ->missed field to atomic_t The ratelimit_state structure's ->missed field is sometimes incremented locklessly, and it would be good to avoid lost counts. This is also needed to count the number of misses due to trylock failure. Therefore, convert the ratelimit_state structure's ->missed field to atomic_t. Link: https://lore.kernel.org/all/fbe93a52-365e-47fe-93a4-44a44547d601@paulmck-laptop/ Link: https://lore.kernel.org/all/20250423115409.3425-1-spasswolf@web.de/ Signed-off-by: Paul E. McKenney Reviewed-by: Petr Mladek Cc: Andrew Morton Cc: Kuniyuki Iwashima Cc: Mateusz Guzik Cc: Steven Rostedt Cc: John Ogness Cc: Sergey Senozhatsky --- include/linux/ratelimit.h | 9 +++------ include/linux/ratelimit_types.h | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h index 8400c5356c18..c78b92b3e5cd 100644 --- a/include/linux/ratelimit.h +++ b/include/linux/ratelimit.h @@ -24,20 +24,17 @@ static inline void ratelimit_default_init(struct ratelimit_state *rs) static inline void ratelimit_state_inc_miss(struct ratelimit_state *rs) { - rs->missed++; + atomic_inc(&rs->missed); } static inline int ratelimit_state_get_miss(struct ratelimit_state *rs) { - return rs->missed; + return atomic_read(&rs->missed); } static inline int ratelimit_state_reset_miss(struct ratelimit_state *rs) { - int ret = rs->missed; - - rs->missed = 0; - return ret; + return atomic_xchg_relaxed(&rs->missed, 0); } static inline void ratelimit_state_reset_interval(struct ratelimit_state *rs, int interval_init) diff --git a/include/linux/ratelimit_types.h b/include/linux/ratelimit_types.h index 765232ce0b5e..d21fe82b67f6 100644 --- a/include/linux/ratelimit_types.h +++ b/include/linux/ratelimit_types.h @@ -18,7 +18,7 @@ struct ratelimit_state { int interval; int burst; int printed; - int missed; + atomic_t missed; unsigned int flags; unsigned long begin; }; -- cgit v1.2.3 From e64a348dc148af41cec266b2143305a2d0228ee7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Mar 2025 15:32:54 -0700 Subject: ratelimit: Avoid jiffies=0 special case The ___ratelimit() function special-cases the jiffies-counter value of zero as "uninitialized". This works well on 64-bit systems, where the jiffies counter is not going to return to zero for more than half a billion years on systems with HZ=1000, but similar 32-bit systems take less than 50 days to wrap the jiffies counter. And although the consequences of wrapping the jiffies counter seem to be limited to minor confusion on the duration of the rate-limiting interval that happens to end at time zero, it is almost no work to avoid this confusion. Therefore, introduce a RATELIMIT_INITIALIZED bit to the ratelimit_state structure's ->flags field so that a ->begin value of zero is no longer special. Link: https://lore.kernel.org/all/fbe93a52-365e-47fe-93a4-44a44547d601@paulmck-laptop/ Link: https://lore.kernel.org/all/20250423115409.3425-1-spasswolf@web.de/ Signed-off-by: Paul E. McKenney Reviewed-by: Petr Mladek Cc: Andrew Morton Cc: Kuniyuki Iwashima Cc: Mateusz Guzik Cc: Steven Rostedt Cc: John Ogness Cc: Sergey Senozhatsky --- include/linux/ratelimit.h | 2 +- include/linux/ratelimit_types.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h index c78b92b3e5cd..adfec24061d1 100644 --- a/include/linux/ratelimit.h +++ b/include/linux/ratelimit.h @@ -43,7 +43,7 @@ static inline void ratelimit_state_reset_interval(struct ratelimit_state *rs, in raw_spin_lock_irqsave(&rs->lock, flags); rs->interval = interval_init; - rs->begin = 0; + rs->flags &= ~RATELIMIT_INITIALIZED; rs->printed = 0; ratelimit_state_reset_miss(rs); raw_spin_unlock_irqrestore(&rs->lock, flags); diff --git a/include/linux/ratelimit_types.h b/include/linux/ratelimit_types.h index d21fe82b67f6..ef6711b6b229 100644 --- a/include/linux/ratelimit_types.h +++ b/include/linux/ratelimit_types.h @@ -11,6 +11,7 @@ /* issue num suppressed message on exit */ #define RATELIMIT_MSG_ON_RELEASE BIT(0) +#define RATELIMIT_INITIALIZED BIT(1) struct ratelimit_state { raw_spinlock_t lock; /* protect the state */ -- cgit v1.2.3 From cf8cfa8a9978bfe77f2648a04824a46d58258e68 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 9 Apr 2025 16:54:33 -0700 Subject: ratelimit: Reduce ___ratelimit() false-positive rate limiting Retain the locked design, but check rate-limiting even when the lock could not be acquired. Link: https://lore.kernel.org/all/Z_VRo63o2UsVoxLG@pathway.suse.cz/ Link: https://lore.kernel.org/all/fbe93a52-365e-47fe-93a4-44a44547d601@paulmck-laptop/ Link: https://lore.kernel.org/all/20250423115409.3425-1-spasswolf@web.de/ Signed-off-by: Petr Mladek Signed-off-by: Paul E. McKenney Cc: Petr Mladek Cc: Andrew Morton Cc: Kuniyuki Iwashima Cc: Mateusz Guzik Cc: Steven Rostedt Cc: John Ogness Cc: Sergey Senozhatsky --- include/linux/ratelimit.h | 2 +- include/linux/ratelimit_types.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h index adfec24061d1..7aaad158ee37 100644 --- a/include/linux/ratelimit.h +++ b/include/linux/ratelimit.h @@ -44,7 +44,7 @@ static inline void ratelimit_state_reset_interval(struct ratelimit_state *rs, in raw_spin_lock_irqsave(&rs->lock, flags); rs->interval = interval_init; rs->flags &= ~RATELIMIT_INITIALIZED; - rs->printed = 0; + atomic_set(&rs->rs_n_left, rs->burst); ratelimit_state_reset_miss(rs); raw_spin_unlock_irqrestore(&rs->lock, flags); } diff --git a/include/linux/ratelimit_types.h b/include/linux/ratelimit_types.h index ef6711b6b229..b19c4354540a 100644 --- a/include/linux/ratelimit_types.h +++ b/include/linux/ratelimit_types.h @@ -18,7 +18,7 @@ struct ratelimit_state { int interval; int burst; - int printed; + atomic_t rs_n_left; atomic_t missed; unsigned int flags; unsigned long begin; -- cgit v1.2.3 From b9e22b35d4598aefed642928ed2856a9900e5b37 Mon Sep 17 00:00:00 2001 From: Cedric Xing Date: Thu, 8 May 2025 19:07:39 -0700 Subject: tsm-mr: Add TVM Measurement Register support Introduce new TSM Measurement helper library (tsm-mr) for TVM guest drivers to expose MRs (Measurement Registers) as sysfs attributes, with Crypto Agility support. Add the following new APIs (see include/linux/tsm-mr.h for details): - tsm_mr_create_attribute_group(): Take on input a `struct tsm_measurements` instance, which includes one `struct tsm_measurement_register` per MR with properties like `TSM_MR_F_READABLE` and `TSM_MR_F_WRITABLE`, to determine the supported operations and create the sysfs attributes accordingly. On success, return a `struct attribute_group` instance that will typically be included by the guest driver into `miscdevice.groups` before calling misc_register(). - tsm_mr_free_attribute_group(): Free the memory allocated to the attrubute group returned by tsm_mr_create_attribute_group(). tsm_mr_create_attribute_group() creates one attribute for each MR, with names following this pattern: MRNAME[:HASH] - MRNAME - Placeholder for the MR name, as specified by `tsm_measurement_register.mr_name`. - :HASH - Optional suffix indicating the hash algorithm associated with this MR, as specified by `tsm_measurement_register.mr_hash`. Support Crypto Agility by allowing multiple definitions of the same MR (i.e., with the same `mr_name`) with distinct HASH algorithms. NOTE: Crypto Agility, introduced in TPM 2.0, allows new hash algorithms to be introduced without breaking compatibility with applications using older algorithms. CC architectures may face the same challenge in the future, needing new hashes for security while retaining compatibility with older hashes, hence the need for Crypto Agility. Signed-off-by: Cedric Xing Reviewed-by: Dan Williams Acked-by: Dionna Amalie Glaze [djbw: fixup bin_attr const conflict] Link: https://patch.msgid.link/20250509020739.882913-1-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/tsm-mr.h | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 include/linux/tsm-mr.h (limited to 'include/linux') diff --git a/include/linux/tsm-mr.h b/include/linux/tsm-mr.h new file mode 100644 index 000000000000..50a521f4ac97 --- /dev/null +++ b/include/linux/tsm-mr.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __TSM_MR_H +#define __TSM_MR_H + +#include + +/** + * struct tsm_measurement_register - describes an architectural measurement + * register (MR) + * @mr_name: name of the MR + * @mr_value: buffer containing the current value of the MR + * @mr_size: size of the MR - typically the digest size of @mr_hash + * @mr_flags: bitwise OR of one or more flags, detailed below + * @mr_hash: optional hash identifier defined in include/uapi/linux/hash_info.h. + * + * A CC guest driver encloses an array of this structure in struct + * tsm_measurements to detail the measurement facility supported by the + * underlying CC hardware. + * + * @mr_name and @mr_value must stay valid until this structure is no longer in + * use. + * + * @mr_flags is the bitwise-OR of zero or more of the flags below. + * + * * %TSM_MR_F_READABLE - the sysfs attribute corresponding to this MR is readable. + * * %TSM_MR_F_WRITABLE - the sysfs attribute corresponding to this MR is writable. + * The semantics is typically to extend the MR but could vary depending on the + * architecture and the MR. + * * %TSM_MR_F_LIVE - this MR's value may differ from the last value written, so + * must be read back from the underlying CC hardware/firmware. + * * %TSM_MR_F_RTMR - bitwise-OR of %TSM_MR_F_LIVE and %TSM_MR_F_WRITABLE. + * * %TSM_MR_F_NOHASH - this MR does NOT have an associated hash algorithm. + * @mr_hash will be ignored when this flag is set. + */ +struct tsm_measurement_register { + const char *mr_name; + void *mr_value; + u32 mr_size; + u32 mr_flags; + enum hash_algo mr_hash; +}; + +#define TSM_MR_F_NOHASH 1 +#define TSM_MR_F_WRITABLE 2 +#define TSM_MR_F_READABLE 4 +#define TSM_MR_F_LIVE 8 +#define TSM_MR_F_RTMR (TSM_MR_F_LIVE | TSM_MR_F_WRITABLE) + +#define TSM_MR_(mr, hash) \ + .mr_name = #mr, .mr_size = hash##_DIGEST_SIZE, \ + .mr_hash = HASH_ALGO_##hash, .mr_flags = TSM_MR_F_READABLE + +/** + * struct tsm_measurements - defines the CC architecture specific measurement + * facility and methods for updating measurement registers (MRs) + * @mrs: Array of MR definitions. + * @nr_mrs: Number of elements in @mrs. + * @refresh: Callback function to load/sync all MRs from TVM hardware/firmware + * into the kernel cache. + * @write: Callback function to write to the MR specified by the parameter @mr. + * Typically, writing to an MR extends the input buffer to that MR. + * + * The @refresh callback is invoked when an MR with %TSM_MR_F_LIVE set is being + * read and the cache is stale. It must reload all MRs with %TSM_MR_F_LIVE set. + * The function parameter @tm is a pointer pointing back to this structure. + * + * The @write callback is invoked whenever an MR is being written. It takes two + * additional parameters besides @tm: + * + * * @mr - points to the MR (an element of @tm->mrs) being written. + * * @data - contains the bytes to write and whose size is @mr->mr_size. + * + * Both @refresh and @write should return 0 on success and an appropriate error + * code on failure. + */ +struct tsm_measurements { + const struct tsm_measurement_register *mrs; + size_t nr_mrs; + int (*refresh)(const struct tsm_measurements *tm); + int (*write)(const struct tsm_measurements *tm, + const struct tsm_measurement_register *mr, const u8 *data); +}; + +const struct attribute_group * +tsm_mr_create_attribute_group(const struct tsm_measurements *tm); +void tsm_mr_free_attribute_group(const struct attribute_group *attr_grp); + +#endif -- cgit v1.2.3 From 4701073c3debd16d7f534f3eb808bd9b50601c0c Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Tue, 6 May 2025 16:07:22 +0800 Subject: net: enetc: add initial netc-lib driver to support NTMP Some NETC functionality is controlled using control messages sent to the hardware using BD ring interface with 32B descriptor similar to transmit BD ring used on ENETC. This BD ring interface is referred to as command BD ring. It is used to configure functionality where the underlying resources may be shared between different entities or being too large to configure using direct registers. Therefore, a messaging protocol called NETC Table Management Protocol (NTMP) is provided for exchanging configuration and management information between the software and the hardware using the command BD ring interface. For the management protocol of LS1028A has been retroactively named NTMP 1.0, and its implementation is in enetc_cbdr.c and enetc_qos.c. However, NTMP of i.MX95 has been upgraded to version 2.0, which is incompatible with LS1028A, because the message formats have been changed. Therefore, add the netc-lib driver to support NTMP 2.0 to operate various tables. Note that, only MAC address filter table and RSS table are supported at the moment. More tables will be supported in subsequent patches. It is worth mentioning that the purpose of the netc-lib driver is to provide some NTMP-based generic interfaces for ENETC and NETC Switch drivers. Currently, it only supports the configurations of some tables. Interfaces such as tc flower and debugfs will be added in the future. Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250506080735.3444381-2-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/fsl/ntmp.h | 121 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 include/linux/fsl/ntmp.h (limited to 'include/linux') diff --git a/include/linux/fsl/ntmp.h b/include/linux/fsl/ntmp.h new file mode 100644 index 000000000000..916dc4fe7de3 --- /dev/null +++ b/include/linux/fsl/ntmp.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* Copyright 2025 NXP */ +#ifndef __NETC_NTMP_H +#define __NETC_NTMP_H + +#include +#include + +struct maft_keye_data { + u8 mac_addr[ETH_ALEN]; + __le16 resv; +}; + +struct maft_cfge_data { + __le16 si_bitmap; + __le16 resv; +}; + +struct netc_cbdr_regs { + void __iomem *pir; + void __iomem *cir; + void __iomem *mr; + + void __iomem *bar0; + void __iomem *bar1; + void __iomem *lenr; +}; + +struct netc_tbl_vers { + u8 maft_ver; + u8 rsst_ver; +}; + +struct netc_cbdr { + struct device *dev; + struct netc_cbdr_regs regs; + + int bd_num; + int next_to_use; + int next_to_clean; + + int dma_size; + void *addr_base; + void *addr_base_align; + dma_addr_t dma_base; + dma_addr_t dma_base_align; + + /* Serialize the order of command BD ring */ + spinlock_t ring_lock; +}; + +struct ntmp_user { + int cbdr_num; /* number of control BD ring */ + struct device *dev; + struct netc_cbdr *ring; + struct netc_tbl_vers tbl; +}; + +struct maft_entry_data { + struct maft_keye_data keye; + struct maft_cfge_data cfge; +}; + +#if IS_ENABLED(CONFIG_NXP_NETC_LIB) +int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev, + const struct netc_cbdr_regs *regs); +void ntmp_free_cbdr(struct netc_cbdr *cbdr); + +/* NTMP APIs */ +int ntmp_maft_add_entry(struct ntmp_user *user, u32 entry_id, + struct maft_entry_data *maft); +int ntmp_maft_query_entry(struct ntmp_user *user, u32 entry_id, + struct maft_entry_data *maft); +int ntmp_maft_delete_entry(struct ntmp_user *user, u32 entry_id); +int ntmp_rsst_update_entry(struct ntmp_user *user, const u32 *table, + int count); +int ntmp_rsst_query_entry(struct ntmp_user *user, + u32 *table, int count); +#else +static inline int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev, + const struct netc_cbdr_regs *regs) +{ + return 0; +} + +static inline void ntmp_free_cbdr(struct netc_cbdr *cbdr) +{ +} + +static inline int ntmp_maft_add_entry(struct ntmp_user *user, u32 entry_id, + struct maft_entry_data *maft) +{ + return 0; +} + +static inline int ntmp_maft_query_entry(struct ntmp_user *user, u32 entry_id, + struct maft_entry_data *maft) +{ + return 0; +} + +static inline int ntmp_maft_delete_entry(struct ntmp_user *user, u32 entry_id) +{ + return 0; +} + +static inline int ntmp_rsst_update_entry(struct ntmp_user *user, + const u32 *table, int count) +{ + return 0; +} + +static inline int ntmp_rsst_query_entry(struct ntmp_user *user, + u32 *table, int count) +{ + return 0; +} + +#endif + +#endif -- cgit v1.2.3 From 1af3331764b9356fadc4652af77bbbc97f3d7f78 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 29 Mar 2025 09:42:19 +0100 Subject: super: add filesystem freezing helpers for suspend and hibernate Allow the power subsystem to support filesystem freeze for suspend and hibernate. For some kernel subsystems it is paramount that they are guaranteed that they are the owner of the freeze to avoid any risk of deadlocks. This is the case for the power subsystem. Enable it to recognize whether it did actually freeze the filesystem. If userspace has 10 filesystems and suspend/hibernate manges to freeze 5 and then fails on the 6th for whatever odd reason (current or future) then power needs to undo the freeze of the first 5 filesystems. It can't just walk the list again because while it's unlikely that a new filesystem got added in the meantime it still cannot tell which filesystems the power subsystem actually managed to get a freeze reference count on that needs to be dropped during thaw. There's various ways out of this ugliness. For example, record the filesystems the power subsystem managed to freeze on a temporary list in the callbacks and then walk that list backwards during thaw to undo the freezing or make sure that the power subsystem just actually exclusively freezes things it can freeze and marking such filesystems as being owned by power for the duration of the suspend or resume cycle. I opted for the latter as that seemed the clean thing to do even if it means more code changes. If hibernation races with filesystem freezing (e.g. DM reconfiguration), then hibernation need not freeze a filesystem because it's already frozen but userspace may thaw the filesystem before hibernation actually happens. If the race happens the other way around, DM reconfiguration may unexpectedly fail with EBUSY. So allow FREEZE_EXCL to nest with other holders. An exclusive freezer cannot be undone by any of the other concurrent freezers. Link: https://lore.kernel.org/r/20250329-work-freeze-v2-6-a47af37ecc3d@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 96f8c8a794ea..7a3f821d2723 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1307,6 +1307,7 @@ struct sb_writers { unsigned short frozen; /* Is sb frozen? */ int freeze_kcount; /* How many kernel freeze requests? */ int freeze_ucount; /* How many userspace freeze requests? */ + const void *freeze_owner; /* Owner of the freeze */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; @@ -2269,6 +2270,7 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed + * @FREEZE_EXCL: a freeze that can only be undone by the owner * * Indicate who the owner of the freeze or thaw request is and whether * the freeze needs to be exclusive or can nest. @@ -2282,6 +2284,7 @@ enum freeze_holder { FREEZE_HOLDER_KERNEL = (1U << 0), FREEZE_HOLDER_USERSPACE = (1U << 1), FREEZE_MAY_NEST = (1U << 2), + FREEZE_EXCL = (1U << 3), }; struct super_operations { @@ -2295,9 +2298,9 @@ struct super_operations { void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); - int (*freeze_super) (struct super_block *, enum freeze_holder who); + int (*freeze_super) (struct super_block *, enum freeze_holder who, const void *owner); int (*freeze_fs) (struct super_block *); - int (*thaw_super) (struct super_block *, enum freeze_holder who); + int (*thaw_super) (struct super_block *, enum freeze_holder who, const void *owner); int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); @@ -2705,8 +2708,10 @@ extern int unregister_filesystem(struct file_system_type *); extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); -int freeze_super(struct super_block *super, enum freeze_holder who); -int thaw_super(struct super_block *super, enum freeze_holder who); +int freeze_super(struct super_block *super, enum freeze_holder who, + const void *freeze_owner); +int thaw_super(struct super_block *super, enum freeze_holder who, + const void *freeze_owner); extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); @@ -3518,6 +3523,8 @@ extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*f)(struct super_block *, void *), void *arg); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); +void filesystems_freeze(void); +void filesystems_thaw(void); extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); -- cgit v1.2.3 From 79fb8d8d93e41be4ebda8c9cc507e20297277231 Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Thu, 8 May 2025 14:49:30 -0400 Subject: include/cgroup: separate {get,put}_cgroup_ns no-op case When CONFIG_CGROUPS is not selected, {get,put}_cgroup_ns become no-ops and therefore it is not necessary to compile in the code for changing the reference count. When CONFIG_CGROUP is selected, there is no valid case where either of {get,put}_cgroup_ns() will be called with a NULL argument. Signed-off-by: Joel Savitz Link: https://lore.kernel.org/20250508184930.183040-3-jsavitz@redhat.com Signed-off-by: Christian Brauner --- include/linux/cgroup.h | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 28e999f2c642..8da19c3627b0 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -785,6 +785,17 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); +static inline void get_cgroup_ns(struct cgroup_namespace *ns) +{ + refcount_inc(&ns->ns.count); +} + +static inline void put_cgroup_ns(struct cgroup_namespace *ns) +{ + if (refcount_dec_and_test(&ns->ns.count)) + free_cgroup_ns(ns); +} + #else /* !CONFIG_CGROUPS */ static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } @@ -795,19 +806,10 @@ copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, return old_ns; } -#endif /* !CONFIG_CGROUPS */ +static inline void get_cgroup_ns(struct cgroup_namespace *ns) { } +static inline void put_cgroup_ns(struct cgroup_namespace *ns) { } -static inline void get_cgroup_ns(struct cgroup_namespace *ns) -{ - if (ns) - refcount_inc(&ns->ns.count); -} - -static inline void put_cgroup_ns(struct cgroup_namespace *ns) -{ - if (ns && refcount_dec_and_test(&ns->ns.count)) - free_cgroup_ns(ns); -} +#endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS -- cgit v1.2.3 From 91e40668e70a08cf09c0e22023807b01d78d8b3a Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 22 Apr 2025 09:18:11 +0100 Subject: mm/page_table_check: Batch-check pmds/puds just like ptes Convert page_table_check_p[mu]d_set(...) to page_table_check_p[mu]ds_set(..., nr) to allow checking a contiguous set of pmds/puds in single batch. We retain page_table_check_p[mu]d_set(...) as macros that call new batch functions with nr=1 for compatibility. arm64 is about to reorganise its pte/pmd/pud helpers to reuse more code and to allow the implementation for huge_pte to more efficiently set ptes/pmds/puds in batches. We need these batch-helpers to make the refactoring possible. Reviewed-by: Anshuman Khandual Reviewed-by: Pasha Tatashin Reviewed-by: Catalin Marinas Signed-off-by: Ryan Roberts Tested-by: Luiz Capitulino Link: https://lore.kernel.org/r/20250422081822.1836315-4-ryan.roberts@arm.com Signed-off-by: Will Deacon --- include/linux/page_table_check.h | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 6722941c7cb8..289620d4aad3 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -19,8 +19,10 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr); -void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd); -void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud); +void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, + unsigned int nr); +void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud, + unsigned int nr); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd); @@ -74,22 +76,22 @@ static inline void page_table_check_ptes_set(struct mm_struct *mm, __page_table_check_ptes_set(mm, ptep, pte, nr); } -static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, - pmd_t pmd) +static inline void page_table_check_pmds_set(struct mm_struct *mm, + pmd_t *pmdp, pmd_t pmd, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pmd_set(mm, pmdp, pmd); + __page_table_check_pmds_set(mm, pmdp, pmd, nr); } -static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, - pud_t pud) +static inline void page_table_check_puds_set(struct mm_struct *mm, + pud_t *pudp, pud_t pud, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pud_set(mm, pudp, pud); + __page_table_check_puds_set(mm, pudp, pud, nr); } static inline void page_table_check_pte_clear_range(struct mm_struct *mm, @@ -129,13 +131,13 @@ static inline void page_table_check_ptes_set(struct mm_struct *mm, { } -static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, - pmd_t pmd) +static inline void page_table_check_pmds_set(struct mm_struct *mm, + pmd_t *pmdp, pmd_t pmd, unsigned int nr) { } -static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, - pud_t pud) +static inline void page_table_check_puds_set(struct mm_struct *mm, + pud_t *pudp, pud_t pud, unsigned int nr) { } @@ -146,4 +148,8 @@ static inline void page_table_check_pte_clear_range(struct mm_struct *mm, } #endif /* CONFIG_PAGE_TABLE_CHECK */ + +#define page_table_check_pmd_set(mm, pmdp, pmd) page_table_check_pmds_set(mm, pmdp, pmd, 1) +#define page_table_check_pud_set(mm, pudp, pud) page_table_check_puds_set(mm, pudp, pud, 1) + #endif /* __LINUX_PAGE_TABLE_CHECK_H */ -- cgit v1.2.3 From 2fba13371fe80b4d0d533a502e460ce0e936d024 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 22 Apr 2025 09:18:16 +0100 Subject: mm/vmalloc: Gracefully unmap huge ptes Commit f7ee1f13d606 ("mm/vmalloc: enable mapping of huge pages at pte level in vmap") added its support by reusing the set_huge_pte_at() API, which is otherwise only used for user mappings. But when unmapping those huge ptes, it continued to call ptep_get_and_clear(), which is a layering violation. To date, the only arch to implement this support is powerpc and it all happens to work ok for it. But arm64's implementation of ptep_get_and_clear() can not be safely used to clear a previous set_huge_pte_at(). So let's introduce a new arch opt-in function, arch_vmap_pte_range_unmap_size(), which can provide the size of a (present) pte. Then we can call huge_ptep_get_and_clear() to tear it down properly. Note that if vunmap_range() is called with a range that starts in the middle of a huge pte-mapped page, we must unmap the entire huge page so the behaviour is consistent with pmd and pud block mappings. In this case emit a warning just like we do for pmd/pud mappings. Reviewed-by: Anshuman Khandual Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Catalin Marinas Signed-off-by: Ryan Roberts Tested-by: Luiz Capitulino Link: https://lore.kernel.org/r/20250422081822.1836315-9-ryan.roberts@arm.com Signed-off-by: Will Deacon --- include/linux/vmalloc.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 31e9ffd936e3..16dd4cba64f2 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -113,6 +113,14 @@ static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, uns } #endif +#ifndef arch_vmap_pte_range_unmap_size +static inline unsigned long arch_vmap_pte_range_unmap_size(unsigned long addr, + pte_t *ptep) +{ + return PAGE_SIZE; +} +#endif + #ifndef arch_vmap_pte_supported_shift static inline int arch_vmap_pte_supported_shift(unsigned long size) { -- cgit v1.2.3 From 63de899cb6220357dea9d0f4e5aa459ff5193bb0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:53 +0100 Subject: io_uring: count allocated requests Keep track of the number requests a ring currently has allocated (and not freed), it'll be needed in the next patch. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c8f8308294dc2a1cb8925d984d937d4fc14ab5d4.1746788718.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 7e23e993280e..73b289b48280 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -435,6 +435,7 @@ struct io_ring_ctx { /* protected by ->completion_lock */ unsigned evfd_last_cq_tail; + unsigned nr_req_allocated; /* * Protection for resize vs mmap races - both the mmap and resize -- cgit v1.2.3 From bb71e40db1b7a7822f434d211b4a94bab9996cc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Fri, 9 May 2025 14:22:39 +0100 Subject: mfd: max77759: Add Maxim MAX77759 core driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Maxim MAX77759 is a companion PMIC for USB Type-C applications and includes Battery Charger, Fuel Gauge, temperature sensors, USB Type-C Port Controller (TCPC), NVMEM, and a GPIO expander. Fuel Gauge and TCPC have separate and independent I2C addresses, register maps, and interrupt lines and are therefore excluded from the MFD core device driver here. The GPIO and NVMEM interfaces are accessed via specific commands to the built-in microprocessor. This driver implements an API that client drivers can use for accessing those. Signed-off-by: André Draszik Acked-by: Peter Griffin Link: https://lore.kernel.org/r/20250509-max77759-mfd-v10-1-962ac15ee3ef@linaro.org Signed-off-by: Lee Jones --- include/linux/mfd/max77759.h | 165 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 include/linux/mfd/max77759.h (limited to 'include/linux') diff --git a/include/linux/mfd/max77759.h b/include/linux/mfd/max77759.h new file mode 100644 index 000000000000..c6face34e385 --- /dev/null +++ b/include/linux/mfd/max77759.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2020 Google Inc. + * Copyright 2025 Linaro Ltd. + * + * Maxim MAX77759 core driver + */ + +#ifndef __LINUX_MFD_MAX77759_H +#define __LINUX_MFD_MAX77759_H + +#include +#include +#include + +#define MAX77759_PMIC_REG_PMIC_ID 0x00 +#define MAX77759_PMIC_REG_PMIC_REVISION 0x01 +#define MAX77759_PMIC_REG_OTP_REVISION 0x02 +#define MAX77759_PMIC_REG_INTSRC 0x22 +#define MAX77759_PMIC_REG_INTSRCMASK 0x23 +#define MAX77759_PMIC_REG_INTSRC_MAXQ BIT(3) +#define MAX77759_PMIC_REG_INTSRC_TOPSYS BIT(1) +#define MAX77759_PMIC_REG_INTSRC_CHGR BIT(0) +#define MAX77759_PMIC_REG_TOPSYS_INT 0x24 +#define MAX77759_PMIC_REG_TOPSYS_INT_MASK 0x26 +#define MAX77759_PMIC_REG_TOPSYS_INT_TSHDN BIT(6) +#define MAX77759_PMIC_REG_TOPSYS_INT_SYSOVLO BIT(5) +#define MAX77759_PMIC_REG_TOPSYS_INT_SYSUVLO BIT(4) +#define MAX77759_PMIC_REG_TOPSYS_INT_FSHIP BIT(0) +#define MAX77759_PMIC_REG_I2C_CNFG 0x40 +#define MAX77759_PMIC_REG_SWRESET 0x50 +#define MAX77759_PMIC_REG_CONTROL_FG 0x51 + +#define MAX77759_MAXQ_REG_UIC_INT1 0x64 +#define MAX77759_MAXQ_REG_UIC_INT1_APCMDRESI BIT(7) +#define MAX77759_MAXQ_REG_UIC_INT1_SYSMSGI BIT(6) +#define MAX77759_MAXQ_REG_UIC_INT1_GPIO6I BIT(1) +#define MAX77759_MAXQ_REG_UIC_INT1_GPIO5I BIT(0) +#define MAX77759_MAXQ_REG_UIC_INT1_GPIOxI(offs, en) (((en) & 1) << (offs)) +#define MAX77759_MAXQ_REG_UIC_INT1_GPIOxI_MASK(offs) \ + MAX77759_MAXQ_REG_UIC_INT1_GPIOxI(offs, ~0) +#define MAX77759_MAXQ_REG_UIC_INT2 0x65 +#define MAX77759_MAXQ_REG_UIC_INT3 0x66 +#define MAX77759_MAXQ_REG_UIC_INT4 0x67 +#define MAX77759_MAXQ_REG_UIC_UIC_STATUS1 0x68 +#define MAX77759_MAXQ_REG_UIC_UIC_STATUS2 0x69 +#define MAX77759_MAXQ_REG_UIC_UIC_STATUS3 0x6a +#define MAX77759_MAXQ_REG_UIC_UIC_STATUS4 0x6b +#define MAX77759_MAXQ_REG_UIC_UIC_STATUS5 0x6c +#define MAX77759_MAXQ_REG_UIC_UIC_STATUS6 0x6d +#define MAX77759_MAXQ_REG_UIC_UIC_STATUS7 0x6f +#define MAX77759_MAXQ_REG_UIC_UIC_STATUS8 0x6f +#define MAX77759_MAXQ_REG_UIC_INT1_M 0x70 +#define MAX77759_MAXQ_REG_UIC_INT2_M 0x71 +#define MAX77759_MAXQ_REG_UIC_INT3_M 0x72 +#define MAX77759_MAXQ_REG_UIC_INT4_M 0x73 +#define MAX77759_MAXQ_REG_AP_DATAOUT0 0x81 +#define MAX77759_MAXQ_REG_AP_DATAOUT32 0xa1 +#define MAX77759_MAXQ_REG_AP_DATAIN0 0xb1 +#define MAX77759_MAXQ_REG_UIC_SWRST 0xe0 + +#define MAX77759_CHGR_REG_CHG_INT 0xb0 +#define MAX77759_CHGR_REG_CHG_INT2 0xb1 +#define MAX77759_CHGR_REG_CHG_INT_MASK 0xb2 +#define MAX77759_CHGR_REG_CHG_INT2_MASK 0xb3 +#define MAX77759_CHGR_REG_CHG_INT_OK 0xb4 +#define MAX77759_CHGR_REG_CHG_DETAILS_00 0xb5 +#define MAX77759_CHGR_REG_CHG_DETAILS_01 0xb6 +#define MAX77759_CHGR_REG_CHG_DETAILS_02 0xb7 +#define MAX77759_CHGR_REG_CHG_DETAILS_03 0xb8 +#define MAX77759_CHGR_REG_CHG_CNFG_00 0xb9 +#define MAX77759_CHGR_REG_CHG_CNFG_01 0xba +#define MAX77759_CHGR_REG_CHG_CNFG_02 0xbb +#define MAX77759_CHGR_REG_CHG_CNFG_03 0xbc +#define MAX77759_CHGR_REG_CHG_CNFG_04 0xbd +#define MAX77759_CHGR_REG_CHG_CNFG_05 0xbe +#define MAX77759_CHGR_REG_CHG_CNFG_06 0xbf +#define MAX77759_CHGR_REG_CHG_CNFG_07 0xc0 +#define MAX77759_CHGR_REG_CHG_CNFG_08 0xc1 +#define MAX77759_CHGR_REG_CHG_CNFG_09 0xc2 +#define MAX77759_CHGR_REG_CHG_CNFG_10 0xc3 +#define MAX77759_CHGR_REG_CHG_CNFG_11 0xc4 +#define MAX77759_CHGR_REG_CHG_CNFG_12 0xc5 +#define MAX77759_CHGR_REG_CHG_CNFG_13 0xc6 +#define MAX77759_CHGR_REG_CHG_CNFG_14 0xc7 +#define MAX77759_CHGR_REG_CHG_CNFG_15 0xc8 +#define MAX77759_CHGR_REG_CHG_CNFG_16 0xc9 +#define MAX77759_CHGR_REG_CHG_CNFG_17 0xca +#define MAX77759_CHGR_REG_CHG_CNFG_18 0xcb +#define MAX77759_CHGR_REG_CHG_CNFG_19 0xcc + +/* MaxQ opcodes for max77759_maxq_command() */ +#define MAX77759_MAXQ_OPCODE_MAXLENGTH (MAX77759_MAXQ_REG_AP_DATAOUT32 - \ + MAX77759_MAXQ_REG_AP_DATAOUT0 + \ + 1) + +#define MAX77759_MAXQ_OPCODE_GPIO_TRIGGER_READ 0x21 +#define MAX77759_MAXQ_OPCODE_GPIO_TRIGGER_WRITE 0x22 +#define MAX77759_MAXQ_OPCODE_GPIO_CONTROL_READ 0x23 +#define MAX77759_MAXQ_OPCODE_GPIO_CONTROL_WRITE 0x24 +#define MAX77759_MAXQ_OPCODE_USER_SPACE_READ 0x81 +#define MAX77759_MAXQ_OPCODE_USER_SPACE_WRITE 0x82 + +/** + * struct max77759 - core max77759 internal data structure + * + * @regmap_top: Regmap for accessing TOP registers + * @maxq_lock: Lock for serializing access to MaxQ + * @regmap_maxq: Regmap for accessing MaxQ registers + * @cmd_done: Used to signal completion of a MaxQ command + * @regmap_charger: Regmap for accessing charger registers + * + * The MAX77759 comprises several sub-blocks, namely TOP, MaxQ, Charger, + * Fuel Gauge, and TCPCI. + */ +struct max77759 { + struct regmap *regmap_top; + + /* This protects MaxQ commands - only one can be active */ + struct mutex maxq_lock; + struct regmap *regmap_maxq; + struct completion cmd_done; + + struct regmap *regmap_charger; +}; + +/** + * struct max77759_maxq_command - structure containing the MaxQ command to + * send + * + * @length: The number of bytes to send. + * @cmd: The data to send. + */ +struct max77759_maxq_command { + u8 length; + u8 cmd[] __counted_by(length); +}; + +/** + * struct max77759_maxq_response - structure containing the MaxQ response + * + * @length: The number of bytes to receive. + * @rsp: The data received. Must have at least @length bytes space. + */ +struct max77759_maxq_response { + u8 length; + u8 rsp[] __counted_by(length); +}; + +/** + * max77759_maxq_command() - issue a MaxQ command and wait for the response + * and associated data + * + * @max77759: The core max77759 device handle. + * @cmd: The command to be sent. + * @rsp: Any response data associated with the command will be copied here; + * can be %NULL if the command has no response (other than ACK). + * + * Return: 0 on success, a negative error number otherwise. + */ +int max77759_maxq_command(struct max77759 *max77759, + const struct max77759_maxq_command *cmd, + struct max77759_maxq_response *rsp); + +#endif /* __LINUX_MFD_MAX77759_H */ -- cgit v1.2.3 From ee971630f20fd421fffcdc4543731ebcb54ed6d0 Mon Sep 17 00:00:00 2001 From: Feng Yang Date: Tue, 6 May 2025 14:14:33 +0800 Subject: bpf: Allow some trace helpers for all prog types if it works under NMI and doesn't use any context-dependent things, should be fine for any program type. The detailed discussion is in [1]. [1] https://lore.kernel.org/all/CAEf4Bza6gK3dsrTosk6k3oZgtHesNDSrDd8sdeQ-GiS6oJixQg@mail.gmail.com/ Suggested-by: Andrii Nakryiko Signed-off-by: Feng Yang Signed-off-by: Andrii Nakryiko Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20250506061434.94277-2-yangfeng59949@163.com --- include/linux/bpf-cgroup.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 9de7adb68294..4847dcade917 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -427,8 +427,6 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, const struct bpf_func_proto * cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); -const struct bpf_func_proto * -cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); #else static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } @@ -465,12 +463,6 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return NULL; } -static inline const struct bpf_func_proto * -cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - return NULL; -} - static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( -- cgit v1.2.3 From c24a65b6a27c78d8540409800886b6622ea86ebf Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 15 Apr 2025 21:15:49 -0500 Subject: iidc/ice/irdma: Update IDC to support multiple consumers In preparation of supporting more than a single core PCI driver for RDMA, move ice specific structs like qset_params, qos_info and qos_params from iidc_rdma.h to iidc_rdma_ice.h. Previously, the ice driver was just exporting its entire PF struct to the auxiliary driver, but since each core driver will have its own different PF struct, implement a universal struct that all core drivers can provide to the auxiliary driver through the probe call. Reviewed-by: Przemek Kitszel Signed-off-by: Dave Ertman Co-developed-by: Mustafa Ismail Signed-off-by: Mustafa Ismail Co-developed-by: Shiraz Saleem Signed-off-by: Shiraz Saleem Co-developed-by: Tatyana Nikolova Signed-off-by: Tatyana Nikolova Signed-off-by: Tony Nguyen --- include/linux/net/intel/iidc_rdma.h | 67 +++++++++----------------------- include/linux/net/intel/iidc_rdma_ice.h | 69 ++++++++++++++++++++++++++++----- 2 files changed, 78 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h index 7f1910289534..8baad1082042 100644 --- a/include/linux/net/intel/iidc_rdma.h +++ b/include/linux/net/intel/iidc_rdma.h @@ -5,7 +5,6 @@ #define _IIDC_RDMA_H_ #include -#include #include #include #include @@ -17,14 +16,19 @@ enum iidc_rdma_event_type { IIDC_RDMA_EVENT_AFTER_MTU_CHANGE, IIDC_RDMA_EVENT_BEFORE_TC_CHANGE, IIDC_RDMA_EVENT_AFTER_TC_CHANGE, + IIDC_RDMA_EVENT_WARN_RESET, IIDC_RDMA_EVENT_CRIT_ERR, IIDC_RDMA_EVENT_NBITS /* must be last */ }; +struct iidc_rdma_event { + DECLARE_BITMAP(type, IIDC_RDMA_EVENT_NBITS); + u32 reg; +}; + enum iidc_rdma_reset_type { - IIDC_PFR, - IIDC_CORER, - IIDC_GLOBR, + IIDC_FUNC_RESET, + IIDC_DEV_RESET, }; enum iidc_rdma_protocol { @@ -32,53 +36,22 @@ enum iidc_rdma_protocol { IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1), }; -#define IIDC_MAX_USER_PRIORITY 8 -#define IIDC_DSCP_PFC_MODE 0x1 - -/* Struct to hold per RDMA Qset info */ -struct iidc_rdma_qset_params { - /* Qset TEID returned to the RDMA driver in - * ice_add_rdma_qset and used by RDMA driver - * for calls to ice_del_rdma_qset - */ - u32 teid; /* Qset TEID */ - u16 qs_handle; /* RDMA driver provides this */ - u16 vport_id; /* VSI index */ - u8 tc; /* TC branch the Qset should belong to */ -}; - -struct iidc_rdma_qos_info { - u64 tc_ctx; - u8 rel_bw; - u8 prio_type; - u8 egress_virt_up; - u8 ingress_virt_up; -}; - -/* Struct to pass QoS info */ -struct iidc_rdma_qos_params { - struct iidc_rdma_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; - u8 up2tc[IIDC_MAX_USER_PRIORITY]; - u8 vport_relative_bw; - u8 vport_priority_type; - u8 num_tc; - u8 pfc_mode; - u8 dscp_map[DSCP_MAX]; -}; - -struct iidc_rdma_event { - DECLARE_BITMAP(type, IIDC_RDMA_EVENT_NBITS); - u32 reg; +/* Structure to be populated by core LAN PCI driver */ +struct iidc_rdma_core_dev_info { + struct pci_dev *pdev; /* PCI device of corresponding to main function */ + struct auxiliary_device *adev; + /* Current active RDMA protocol */ + enum iidc_rdma_protocol rdma_protocol; + void *iidc_priv; /* elements unique to each driver */ }; /* Structure representing auxiliary driver tailored information about the core * PCI dev, each auxiliary driver using the IIDC interface will have an * instance of this struct dedicated to it. */ - struct iidc_rdma_core_auxiliary_dev { struct auxiliary_device adev; - struct ice_pf *pf; + struct iidc_rdma_core_dev_info *cdev_info; }; /* structure representing the auxiliary driver. This struct is to be @@ -88,12 +61,8 @@ struct iidc_rdma_core_auxiliary_dev { */ struct iidc_rdma_core_auxiliary_drv { struct auxiliary_driver adrv; - /* This event_handler is meant to be a blocking call. For instance, - * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not - * return until the auxiliary driver is ready for the MTU change to - * happen. - */ - void (*event_handler)(struct ice_pf *pf, struct iidc_rdma_event *event); + void (*event_handler)(struct iidc_rdma_core_dev_info *cdev, + struct iidc_rdma_event *event); }; #endif /* _IIDC_RDMA_H_*/ diff --git a/include/linux/net/intel/iidc_rdma_ice.h b/include/linux/net/intel/iidc_rdma_ice.h index 78d10003d776..b40eed0e13fe 100644 --- a/include/linux/net/intel/iidc_rdma_ice.h +++ b/include/linux/net/intel/iidc_rdma_ice.h @@ -4,16 +4,67 @@ #ifndef _IIDC_RDMA_ICE_H_ #define _IIDC_RDMA_ICE_H_ -struct ice_pf; +#include -int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_rdma_request_reset(struct ice_pf *pf, +#define IIDC_MAX_USER_PRIORITY 8 +#define IIDC_DSCP_PFC_MODE 0x1 + +/** + * struct iidc_rdma_qset_params - Struct to hold per RDMA Qset info + * @teid: TEID of the Qset node + * @qs_handle: SW index of the Qset, RDMA provides this + * @vport_id: VSI index + * @tc: Traffic Class branch the QSet should belong to + */ +struct iidc_rdma_qset_params { + /* Qset TEID returned to the RDMA driver in + * ice_add_rdma_qset and used by RDMA driver + * for calls to ice_del_rdma_qset + */ + u32 teid; + u16 qs_handle; + u16 vport_id; + u8 tc; +}; + +struct iidc_rdma_qos_info { + u64 tc_ctx; + u8 rel_bw; + u8 prio_type; + u8 egress_virt_up; + u8 ingress_virt_up; +}; + +/* Struct to pass QoS info */ +struct iidc_rdma_qos_params { + struct iidc_rdma_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; + u8 up2tc[IIDC_MAX_USER_PRIORITY]; + u8 vport_relative_bw; + u8 vport_priority_type; + u8 num_tc; + u8 pfc_mode; + u8 dscp_map[DSCP_MAX]; +}; + +struct iidc_rdma_priv_dev_info { + u8 pf_id; + u16 vport_id; + struct net_device *netdev; + struct iidc_rdma_qos_params qos_info; + u8 __iomem *hw_addr; +}; + +int ice_add_rdma_qset(struct iidc_rdma_core_dev_info *cdev, + struct iidc_rdma_qset_params *qset); +int ice_del_rdma_qset(struct iidc_rdma_core_dev_info *cdev, + struct iidc_rdma_qset_params *qset); +int ice_rdma_request_reset(struct iidc_rdma_core_dev_info *cdev, enum iidc_rdma_reset_type reset_type); -int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); -void ice_get_qos_params(struct ice_pf *pf, - struct iidc_rdma_qos_params *qos); -int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); -void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); +int ice_rdma_update_vsi_filter(struct iidc_rdma_core_dev_info *cdev, u16 vsi_id, + bool enable); +int ice_alloc_rdma_qvector(struct iidc_rdma_core_dev_info *cdev, + struct msix_entry *entry); +void ice_free_rdma_qvector(struct iidc_rdma_core_dev_info *cdev, + struct msix_entry *entry); #endif /* _IIDC_RDMA_ICE_H_*/ -- cgit v1.2.3 From 092a38565ed87cbda6fe7dd16c742df6f907483e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 May 2025 17:21:13 -0400 Subject: ring-buffer: Add ring_buffer_record_is_on_cpu() Add the function ring_buffer_record_is_on_cpu() that returns true if the ring buffer for a give CPU is writable and false otherwise. Also add tracer_tracing_is_on_cpu() to return if the ring buffer for a given CPU is writeable for a given trace_array. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250505212236.059853898@goodmis.org Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 56e27263acf8..cd7f0ae26615 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -192,6 +192,7 @@ void ring_buffer_record_off(struct trace_buffer *buffer); void ring_buffer_record_on(struct trace_buffer *buffer); bool ring_buffer_record_is_on(struct trace_buffer *buffer); bool ring_buffer_record_is_set_on(struct trace_buffer *buffer); +bool ring_buffer_record_is_on_cpu(struct trace_buffer *buffer, int cpu); void ring_buffer_record_disable_cpu(struct trace_buffer *buffer, int cpu); void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu); -- cgit v1.2.3 From f4818881c47fd91fcb6d62373c57c7844e3de1c0 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 21 Jun 2024 20:23:23 -0700 Subject: x86/its: Enable Indirect Target Selection mitigation Indirect Target Selection (ITS) is a bug in some pre-ADL Intel CPUs with eIBRS. It affects prediction of indirect branch and RETs in the lower half of cacheline. Due to ITS such branches may get wrongly predicted to a target of (direct or indirect) branch that is located in the upper half of the cacheline. Scope of impact =============== Guest/host isolation -------------------- When eIBRS is used for guest/host isolation, the indirect branches in the VMM may still be predicted with targets corresponding to branches in the guest. Intra-mode ---------- cBPF or other native gadgets can be used for intra-mode training and disclosure using ITS. User/kernel isolation --------------------- When eIBRS is enabled user/kernel isolation is not impacted. Indirect Branch Prediction Barrier (IBPB) ----------------------------------------- After an IBPB, indirect branches may be predicted with targets corresponding to direct branches which were executed prior to IBPB. This is mitigated by a microcode update. Add cmdline parameter indirect_target_selection=off|on|force to control the mitigation to relocate the affected branches to an ITS-safe thunk i.e. located in the upper half of cacheline. Also add the sysfs reporting. When retpoline mitigation is deployed, ITS safe-thunks are not needed, because retpoline sequence is already ITS-safe. Similarly, when call depth tracking (CDT) mitigation is deployed (retbleed=stuff), ITS safe return thunk is not used, as CDT prevents RSB-underflow. To not overcomplicate things, ITS mitigation is not supported with spectre-v2 lfence;jmp mitigation. Moreover, it is less practical to deploy lfence;jmp mitigation on ITS affected parts anyways. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre --- include/linux/cpu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index e3049543008b..3aa955102b34 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -78,6 +78,8 @@ extern ssize_t cpu_show_gds(struct device *dev, extern ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_ghostwrite(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_indirect_target_selection(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, -- cgit v1.2.3 From d6d1e3e6580ca35071ad474381f053cbf1fb6414 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 Apr 2025 16:30:11 +0200 Subject: mm/execmem: Unify early execmem_cache behaviour Early kernel memory is RWX, only at the end of early boot (before SMP) do we mark things ROX. Have execmem_cache mirror this behaviour for early users. This avoids having to remember what code is execmem and what is not -- we can poke everything with impunity ;-) Also performance for not having to do endless text_poke_mm switches. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Alexandre Chartre --- include/linux/execmem.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 65655a5d1be2..089c71cc8ddf 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -53,7 +53,7 @@ enum execmem_range_flags { EXECMEM_ROX_CACHE = (1 << 1), }; -#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX +#if defined(CONFIG_ARCH_HAS_EXECMEM_ROX) && defined(CONFIG_EXECMEM) /** * execmem_fill_trapping_insns - set memory to contain instructions that * will trap @@ -93,9 +93,15 @@ int execmem_make_temp_rw(void *ptr, size_t size); * Return: 0 on success or negative error code on failure. */ int execmem_restore_rox(void *ptr, size_t size); + +/* + * Called from mark_readonly(), where the system transitions to ROX. + */ +void execmem_cache_make_ro(void); #else static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; } static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; } +static inline void execmem_cache_make_ro(void) { } #endif /** -- cgit v1.2.3 From 872df34d7c51a79523820ea6a14860398c639b87 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Oct 2024 10:05:48 -0700 Subject: x86/its: Use dynamic thunks for indirect branches ITS mitigation moves the unsafe indirect branches to a safe thunk. This could degrade the prediction accuracy as the source address of indirect branches becomes same for different execution paths. To improve the predictions, and hence the performance, assign a separate thunk for each indirect callsite. This is also a defense-in-depth measure to avoid indirect branches aliasing with each other. As an example, 5000 dynamic thunks would utilize around 16 bits of the address space, thereby gaining entropy. For a BTB that uses 32 bits for indexing, dynamic thunks could provide better prediction accuracy over fixed thunks. Have ITS thunks be variable sized and use EXECMEM_MODULE_TEXT such that they are both more flexible (got to extend them later) and live in 2M TLBs, just like kernel code, avoiding undue TLB pressure. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Alexandre Chartre --- include/linux/execmem.h | 3 +++ include/linux/module.h | 5 +++++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 089c71cc8ddf..ca42d5e46ccc 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -4,6 +4,7 @@ #include #include +#include #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ !defined(CONFIG_KASAN_VMALLOC) @@ -176,6 +177,8 @@ void *execmem_alloc(enum execmem_type type, size_t size); */ void execmem_free(void *ptr); +DEFINE_FREE(execmem, void *, if (_T) execmem_free(_T)); + #ifdef CONFIG_MMU /** * execmem_vmap - create virtual mapping for EXECMEM_MODULE_DATA memory diff --git a/include/linux/module.h b/include/linux/module.h index b3329110d668..8050f77c3b64 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -586,6 +586,11 @@ struct module { atomic_t refcnt; #endif +#ifdef CONFIG_MITIGATION_ITS + int its_num_pages; + void **its_page_array; +#endif + #ifdef CONFIG_CONSTRUCTORS /* Constructor functions. */ ctor_fn_t *ctors; -- cgit v1.2.3 From f2987c5816bda01a8ffdd4eb5dfaaa2b41b67039 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 6 May 2025 20:48:59 +0800 Subject: block: export API to get the number of bdev inflight IO - rename part_in_{flight, flight_rw} to bdev_count_{inflight, inflight_rw} - export bdev_count_inflight, to fix a problem in mdraid that foreground IO can be starved by background sync IO in later patches Link: https://lore.kernel.org/linux-raid/20250506124903.2540268-6-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke --- include/linux/part_stat.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index c5e9cac0575e..eeeff2a04529 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -79,4 +79,6 @@ static inline void part_stat_set_all(struct block_device *part, int value) #define part_stat_local_read_cpu(part, field, cpu) \ local_read(&(part_stat_get_cpu(part, field, cpu))) +unsigned int bdev_count_inflight(struct block_device *part); + #endif /* _LINUX_PART_STAT_H */ -- cgit v1.2.3 From 752d0464b78a5b28682256ed7a057106119e1d1a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 6 May 2025 20:49:03 +0800 Subject: md: clean up accounting for issued sync IO It's no longer used and can be removed, also remove the field 'gendisk->sync_io'. Link: https://lore.kernel.org/linux-raid/20250506124903.2540268-10-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 52b7a27c46e8..ad75dee3b213 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -182,7 +182,6 @@ struct gendisk { struct list_head slave_bdevs; #endif struct timer_rand_state *random; - atomic_t sync_io; /* RAID */ struct disk_events *ev; #ifdef CONFIG_BLK_DEV_ZONED -- cgit v1.2.3 From 094ac8cff7858bee5fa4554f6ea66c964f8e160e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 10 May 2025 10:45:28 +0200 Subject: futex: Relax the rcu_assign_pointer() assignment of mm->futex_phash in futex_mm_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following commit added an rcu_assign_pointer() assignment to futex_mm_init() in : bd54df5ea7ca ("futex: Allow to resize the private local hash") Which breaks the build on older compilers (gcc-9, x86-64 defconfig): CC io_uring/futex.o In file included from ./arch/x86/include/generated/asm/rwonce.h:1, from ./include/linux/compiler.h:390, from ./include/linux/array_size.h:5, from ./include/linux/kernel.h:16, from io_uring/futex.c:2: ./include/linux/futex.h: In function 'futex_mm_init': ./include/linux/rcupdate.h:555:36: error: dereferencing pointer to incomplete type 'struct futex_private_hash' The problem is that this variant of rcu_assign_pointer() wants to know the full type of 'struct futex_private_hash', which type is local to futex.c: kernel/futex/core.c:struct futex_private_hash { There are a couple of mechanical solutions for this bug: - we can uninline futex_mm_init() and move it into futex/core.c - or we can share the structure definition with kernel/fork.c. But both of these solutions have disadvantages: the first one adds runtime overhead, while the second one dis-encapsulates private futex types. A third solution, implemented by this patch, is to just initialize mm->futex_phash with NULL like the patch below, it's not like this new MM's ->futex_phash can be observed externally until the task is inserted into the task list, which guarantees full store ordering. The relaxation of this initialization might also give a tiny speedup on certain platforms. Fixes: bd54df5ea7ca ("futex: Allow to resize the private local hash") Signed-off-by: Ingo Molnar Cc: André Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: Juri Lelli Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Valentin Schneider Cc: Waiman Long Link: https://lore.kernel.org/r/aB8SI00EHBri23lB@gmail.com --- include/linux/futex.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/futex.h b/include/linux/futex.h index eccc99751bd9..168ffd5996b4 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -88,7 +88,14 @@ void futex_hash_free(struct mm_struct *mm); static inline void futex_mm_init(struct mm_struct *mm) { - rcu_assign_pointer(mm->futex_phash, NULL); + /* + * No need for rcu_assign_pointer() here, as we can rely on + * tasklist_lock write-ordering in copy_process(), before + * the task's MM becomes visible and the ->futex_phash + * becomes externally observable: + */ + mm->futex_phash = NULL; + mutex_init(&mm->futex_hash_lock); } -- cgit v1.2.3 From 1f93d877f09d987f08baedd50597aaaa72a37be4 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 8 May 2025 21:12:07 +0300 Subject: ALSA/hda: intel-sdw-acpi: Correct sdw_intel_acpi_scan() function parameter The acpi_handle should be just a handle and not a pointer in sdw_intel_acpi_scan() parameter list. It is called with 'acpi_handle handle' as parameter and it is passing it to acpi_walk_namespace, which also expects acpi_handle and not acpi_handle* Signed-off-by: Peter Ujfalusi Reviewed-by: Bard Liao Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Link: https://patch.msgid.link/20250508181207.22113-1-peter.ujfalusi@linux.intel.com Signed-off-by: Takashi Iwai --- include/linux/soundwire/sdw_intel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 493d9de4e472..dc6ebaee3d18 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -365,7 +365,7 @@ struct sdw_intel_res { * on e.g. which machine driver to select (I2S mode, HDaudio or * SoundWire). */ -int sdw_intel_acpi_scan(acpi_handle *parent_handle, +int sdw_intel_acpi_scan(acpi_handle parent_handle, struct sdw_intel_acpi_info *info); void sdw_intel_process_wakeen_event(struct sdw_intel_ctx *ctx); -- cgit v1.2.3 From 18c64378ad85ef00e70f196793ee8901a8aa2fa1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Apr 2025 10:22:14 -0400 Subject: sunrpc: add info about xprt queue times to svc_xprt_dequeue tracepoint I've been looking at a problem where we see increased RPC timeouts in clients when the nfs_layout_flexfiles dataserver_timeo value is tuned very low (6s). This is necessary to ensure quick failover to a different mirror if a server goes down, but it causes a lot more major RPC timeouts. Ultimately, the problem is server-side however. It's sometimes doesn't respond to connection attempts. My theory is that the interrupt handler runs when a connection comes in, the xprt ends up being enqueued, but it takes a significant amount of time for the nfsd thread to pick it up. Currently, the svc_xprt_dequeue tracepoint displays "wakeup-us". This is the time between the wake_up() call, and the thread dequeueing the xprt. If no thread was woken, or the thread ended up picking up a different xprt than intended, then this value won't tell us how long the xprt was waiting. Add a new xpt_qtime field to struct svc_xprt and set it in svc_xprt_enqueue(). When the dequeue tracepoint fires, also store the time that the xprt sat on the queue in total. Display it as "qtime-us". Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_xprt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 72be60952579..369a89aea186 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -53,6 +53,7 @@ struct svc_xprt { struct svc_xprt_class *xpt_class; const struct svc_xprt_ops *xpt_ops; struct kref xpt_ref; + ktime_t xpt_qtime; struct list_head xpt_list; struct lwq_node xpt_ready; unsigned long xpt_flags; -- cgit v1.2.3 From 0ae0227fa31dda5bfc6b5a0145952d46fe57408b Mon Sep 17 00:00:00 2001 From: David Wang <00107082@163.com> Date: Tue, 6 May 2025 03:30:34 +0800 Subject: mm/codetag: move tag retrieval back upfront in __free_pages() Commit 51ff4d7486f0 ("mm: avoid extra mem_alloc_profiling_enabled() checks") introduces a possible use-after-free scenario, when page is non-compound, page[0] could be released by other thread right after put_page_testzero failed in current thread, pgalloc_tag_sub_pages afterwards would manipulate an invalid page for accounting remaining pages: [timeline] [thread1] [thread2] | alloc_page non-compound V | get_page, rf counter inc V | in ___free_pages | put_page_testzero fails V | put_page, page released V | in ___free_pages, | pgalloc_tag_sub_pages | manipulate an invalid page V Restore __free_pages() to its state before, retrieve alloc tag beforehand. Link: https://lkml.kernel.org/r/20250505193034.91682-1-00107082@163.com Fixes: 51ff4d7486f0 ("mm: avoid extra mem_alloc_profiling_enabled() checks") Signed-off-by: David Wang <00107082@163.com> Acked-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index c74077977830..8a7f4f802c57 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -188,6 +188,13 @@ static inline struct alloc_tag *__pgalloc_tag_get(struct page *page) return tag; } +static inline struct alloc_tag *pgalloc_tag_get(struct page *page) +{ + if (mem_alloc_profiling_enabled()) + return __pgalloc_tag_get(page); + return NULL; +} + void pgalloc_tag_split(struct folio *folio, int old_order, int new_order); void pgalloc_tag_swap(struct folio *new, struct folio *old); @@ -199,6 +206,7 @@ static inline void clear_page_tag_ref(struct page *page) {} static inline void alloc_tag_sec_init(void) {} static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {} +static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } #endif /* CONFIG_MEM_ALLOC_PROFILING */ -- cgit v1.2.3 From cb5b13cd6c9237fe5ac978b22453eb3fa098a8d6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 19:16:56 +0100 Subject: mm: introduce a common definition of mk_pte() Most architectures simply call pfn_pte(). Centralise that as the normal definition and remove the definition of mk_pte() from the architectures which have either that exact definition or something similar. Link: https://lkml.kernel.org/r/20250402181709.2386022-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Geert Uytterhoeven # m68k Acked-by: David Hildenbrand Reviewed-by: Alexander Gordeev # s390 Cc: Zi Yan Cc: Andreas Larsson Cc: Anton Ivanov Cc: Dave Hansen Cc: "David S. Miller" Cc: Johannes Berg Cc: Muchun Song Cc: Richard Weinberger Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index bf55206935c4..aa944eaad0ec 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2004,6 +2004,15 @@ static inline struct folio *pfn_folio(unsigned long pfn) return page_folio(pfn_to_page(pfn)); } +#ifndef mk_pte +#ifdef CONFIG_MMU +static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) +{ + return pfn_pte(page_to_pfn(page), pgprot); +} +#endif +#endif + static inline bool folio_has_pincount(const struct folio *folio) { if (IS_ENABLED(CONFIG_64BIT)) -- cgit v1.2.3 From 4ec492a628d897806bb6dc13b1c257c4e06eb1cf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 19:17:00 +0100 Subject: mm: make mk_pte() definition unconditional All architectures now use the common mk_pte() definition, so we can remove the condition. Link: https://lkml.kernel.org/r/20250402181709.2386022-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Zi Yan Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Anton Ivanov Cc: Dave Hansen Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Johannes Berg Cc: Muchun Song Cc: Richard Weinberger Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index aa944eaad0ec..3a55903d68e2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2004,14 +2004,12 @@ static inline struct folio *pfn_folio(unsigned long pfn) return page_folio(pfn_to_page(pfn)); } -#ifndef mk_pte #ifdef CONFIG_MMU static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) { return pfn_pte(page_to_pfn(page), pgprot); } #endif -#endif static inline bool folio_has_pincount(const struct folio *folio) { -- cgit v1.2.3 From deb8d4d28e4d05c4ecfc6e242c0a53d49e119224 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 19:17:01 +0100 Subject: mm: add folio_mk_pte() Remove a cast from folio to page in four callers of mk_pte(). Link: https://lkml.kernel.org/r/20250402181709.2386022-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Zi Yan Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Anton Ivanov Cc: Dave Hansen Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Johannes Berg Cc: Muchun Song Cc: Richard Weinberger Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3a55903d68e2..cbad8c663c4d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2009,6 +2009,21 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) { return pfn_pte(page_to_pfn(page), pgprot); } + +/** + * folio_mk_pte - Create a PTE for this folio + * @folio: The folio to create a PTE for + * @pgprot: The page protection bits to use + * + * Create a page table entry for the first page of this folio. + * This is suitable for passing to set_ptes(). + * + * Return: A page table entry suitable for mapping this folio. + */ +static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot) +{ + return pfn_pte(folio_pfn(folio), pgprot); +} #endif static inline bool folio_has_pincount(const struct folio *folio) -- cgit v1.2.3 From e3981db444a0a18d350d9f92e3f2e8d489b54211 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 19:17:04 +0100 Subject: mm: add folio_mk_pmd() Removes five conversions from folio to page. Also removes both callers of mk_pmd() that aren't part of mk_huge_pmd(), getting us a step closer to removing the confusion between mk_pmd(), mk_huge_pmd() and pmd_mkhuge(). Link: https://lkml.kernel.org/r/20250402181709.2386022-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Anton Ivanov Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Johannes Berg Cc: Muchun Song Cc: Richard Weinberger Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index cbad8c663c4d..733dd7100ca1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2024,7 +2024,24 @@ static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot) { return pfn_pte(folio_pfn(folio), pgprot); } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/** + * folio_mk_pmd - Create a PMD for this folio + * @folio: The folio to create a PMD for + * @pgprot: The page protection bits to use + * + * Create a page table entry for the first page of this folio. + * This is suitable for passing to set_pmd_at(). + * + * Return: A page table entry suitable for mapping this folio. + */ +static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot) +{ + return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot)); +} #endif +#endif /* CONFIG_MMU */ static inline bool folio_has_pincount(const struct folio *folio) { -- cgit v1.2.3 From 5071ea3d7b3d1e9660524374083a929a6885d78a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 19:17:05 +0100 Subject: arch: remove mk_pmd() There are now no callers of mk_huge_pmd() and mk_pmd(). Remove them. Link: https://lkml.kernel.org/r/20250402181709.2386022-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Zi Yan Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Anton Ivanov Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Johannes Berg Cc: Muchun Song Cc: Richard Weinberger Cc: Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e893d546a49f..f190998b2ebd 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -495,8 +495,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) struct folio *mm_get_huge_zero_folio(struct mm_struct *mm); void mm_put_huge_zero_folio(struct mm_struct *mm); -#define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) - static inline bool thp_migration_supported(void) { return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); -- cgit v1.2.3 From c09b997342bcd1b3c2b63c6ff6fecf037a68beff Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 22:06:03 +0100 Subject: filemap: remove readahead_page() Patch series "Misc folio patches for 6.16". Remove a few APIs that we've converted everybody from using. I also found a few places that extract a page pointer from i_pages, which will be an invalid thing to do when we separate pages from folios. This patch (of 8): All filesystems have now been converted to call readahead_folio() so we can delete this wrapper. Link: https://lkml.kernel.org/r/20250402210612.2444135-1-willy@infradead.org Link: https://lkml.kernel.org/r/20250402210612.2444135-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 26baa78f1ca7..cd4bd0f8e5f6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1308,9 +1308,9 @@ static inline bool filemap_range_needs_writeback(struct address_space *mapping, * struct readahead_control - Describes a readahead request. * * A readahead request is for consecutive pages. Filesystems which - * implement the ->readahead method should call readahead_page() or - * readahead_page_batch() in a loop and attempt to start I/O against - * each page in the request. + * implement the ->readahead method should call readahead_folio() or + * __readahead_batch() in a loop and attempt to start reads into each + * folio in the request. * * Most of the fields in this struct are private and should be accessed * by the functions below. @@ -1415,22 +1415,6 @@ static inline struct folio *__readahead_folio(struct readahead_control *ractl) return folio; } -/** - * readahead_page - Get the next page to read. - * @ractl: The current readahead request. - * - * Context: The page is locked and has an elevated refcount. The caller - * should decreases the refcount once the page has been submitted for I/O - * and unlock the page once all I/O to that page has completed. - * Return: A pointer to the next page, or %NULL if we are done. - */ -static inline struct page *readahead_page(struct readahead_control *ractl) -{ - struct folio *folio = __readahead_folio(ractl); - - return &folio->page; -} - /** * readahead_folio - Get the next folio to read. * @ractl: The current readahead request. -- cgit v1.2.3 From a55139579082e4bc9ea9b04003cb9f78c781e08b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 22:06:04 +0100 Subject: mm: remove offset_in_thp() All callers have been converted to call offset_in_folio(). Link: https://lkml.kernel.org/r/20250402210612.2444135-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 733dd7100ca1..ae67d3a33792 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2445,7 +2445,6 @@ static inline void clear_page_pfmemalloc(struct page *page) extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) -#define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) #define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) /* -- cgit v1.2.3 From 9c532d79082f9f2e58a45f2e92020dda65b7dedd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 22:06:07 +0100 Subject: filemap: remove find_subpage() All users of this function now call folio_file_page() instead. Delete it. Link: https://lkml.kernel.org/r/20250402210612.2444135-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index cd4bd0f8e5f6..0ddd4bd8cdf8 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -945,19 +945,6 @@ static inline bool folio_contains(struct folio *folio, pgoff_t index) return index - folio_index(folio) < folio_nr_pages(folio); } -/* - * Given the page we found in the page cache, return the page corresponding - * to this index in the file - */ -static inline struct page *find_subpage(struct page *head, pgoff_t index) -{ - /* HugeTLBfs wants the head page regardless */ - if (PageHuge(head)) - return head; - - return head + (index & (thp_nr_pages(head) - 1)); -} - unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_contig(struct address_space *mapping, -- cgit v1.2.3 From 8dfc8cbf7b07da172b3a4c8bea064e84fbfa5d56 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 22:06:08 +0100 Subject: filemap: convert __readahead_batch() to use a folio Extract folios from i_mapping, not pages. Removes a hidden call to compound_head(), a use of thp_nr_pages() and an unnecessary assertion that we didn't find a tail page in the page cache. Link: https://lkml.kernel.org/r/20250402210612.2444135-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0ddd4bd8cdf8..c5c9b3770d75 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1424,7 +1424,7 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, { unsigned int i = 0; XA_STATE(xas, &rac->mapping->i_pages, 0); - struct page *page; + struct folio *folio; BUG_ON(rac->_batch_count > rac->_nr_pages); rac->_nr_pages -= rac->_batch_count; @@ -1433,13 +1433,12 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, xas_set(&xas, rac->_index); rcu_read_lock(); - xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, rac->_index + rac->_nr_pages - 1) { + if (xas_retry(&xas, folio)) continue; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageTail(page), page); - array[i++] = page; - rac->_batch_count += thp_nr_pages(page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + array[i++] = folio_page(folio, 0); + rac->_batch_count += folio_nr_pages(folio); if (i == array_sz) break; } -- cgit v1.2.3 From 41e422a898da69e903a846dfb2b0c0ff62abc3cd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 22:06:09 +0100 Subject: filemap: remove readahead_page_batch() This function has no more callers; delete it. Link: https://lkml.kernel.org/r/20250402210612.2444135-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c5c9b3770d75..af25fb640463 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1447,20 +1447,6 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, return i; } -/** - * readahead_page_batch - Get a batch of pages to read. - * @rac: The current readahead request. - * @array: An array of pointers to struct page. - * - * Context: The pages are locked and have an elevated refcount. The caller - * should decreases the refcount once the page has been submitted for I/O - * and unlock the page once all I/O to that page has completed. - * Return: The number of pages placed in the array. 0 indicates the request - * is complete. - */ -#define readahead_page_batch(rac, array) \ - __readahead_batch(rac, array, ARRAY_SIZE(array)) - /** * readahead_pos - The byte offset into the file of this readahead request. * @rac: The readahead request. -- cgit v1.2.3 From 2355153ea8185e7deaf9ce728716dadb707ef59f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 22:06:10 +0100 Subject: mm: delete thp_nr_pages() All callers now use folio_nr_pages(). Delete this wrapper. Link: https://lkml.kernel.org/r/20250402210612.2444135-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ae67d3a33792..dcdb798184ef 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2223,15 +2223,6 @@ static inline long compound_nr(struct page *page) return folio_large_nr_pages(folio); } -/** - * thp_nr_pages - The number of regular pages in this huge page. - * @page: The head page of a huge page. - */ -static inline long thp_nr_pages(struct page *page) -{ - return folio_nr_pages((struct folio *)page); -} - /** * folio_next - Move to the next physical folio. * @folio: The folio we're currently operating on. -- cgit v1.2.3 From 56e5a103a721d0ef139bba7ff3d3ada6c8217d5b Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Wed, 2 Apr 2025 13:44:16 -0700 Subject: zsmalloc: prefer the the original page's node for compressed data Currently, zsmalloc, zswap's and zram's backend memory allocator, does not enforce any policy for the allocation of memory for the compressed data, instead just adopting the memory policy of the task entering reclaim, or the default policy (prefer local node) if no such policy is specified. This can lead to several pathological behaviors in multi-node NUMA systems: 1. Systems with CXL-based memory tiering can encounter the following inversion with zswap/zram: the coldest pages demoted to the CXL tier can return to the high tier when they are reclaimed to compressed swap, creating memory pressure on the high tier. 2. Consider a direct reclaimer scanning nodes in order of allocation preference. If it ventures into remote nodes, the memory it compresses there should stay there. Trying to shift those contents over to the reclaiming thread's preferred node further *increases* its local pressure, and provoking more spills. The remote node is also the most likely to refault this data again. This undesirable behavior was pointed out by Johannes Weiner in [1]. 3. For zswap writeback, the zswap entries are organized in node-specific LRUs, based on the node placement of the original pages, allowing for targeted zswap writeback for specific nodes. However, the compressed data of a zswap entry can be placed on a different node from the LRU it is placed on. This means that reclaim targeted at one node might not free up memory used for zswap entries in that node, but instead reclaiming memory in a different node. All of these issues will be resolved if the compressed data go to the same node as the original page. This patch encourages this behavior by having zswap and zram pass the node of the original page to zsmalloc, and have zsmalloc prefer the specified node if we need to allocate new (zs)pages for the compressed data. Note that we are not strictly binding the allocation to the preferred node. We still allow the allocation to fall back to other nodes when the preferred node is full, or if we have zspages with slots available on a different node. This is OK, and still a strict improvement over the status quo: 1. On a system with demotion enabled, we will generally prefer demotions over compressed swapping, and only swap when pages have already gone to the lowest tier. This patch should achieve the desired effect for the most part. 2. If the preferred node is out of memory, letting the compressed data going to other nodes can be better than the alternative (OOMs, keeping cold memory unreclaimed, disk swapping, etc.). 3. If the allocation go to a separate node because we have a zspage with slots available, at least we're not creating extra immediate memory pressure (since the space is already allocated). 3. While there can be mixings, we generally reclaim pages in same-node batches, which encourage zspage grouping that is more likely to go to the right node. 4. A strict binding would require partitioning zsmalloc by node, which is more complicated, and more prone to regression, since it reduces the storage density of zsmalloc. We need to evaluate the tradeoff and benchmark carefully before adopting such an involved solution. [1]: https://lore.kernel.org/linux-mm/20250331165306.GC2110528@cmpxchg.org/ [senozhatsky@chromium.org: coding-style fixes] Link: https://lkml.kernel.org/r/mnvexa7kseswglcqbhlot4zg3b3la2ypv2rimdl5mh5glbmhvz@wi6bgqn47hge Link: https://lkml.kernel.org/r/20250402204416.3435994-1-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Gregory Price Acked-by: Dan Williams Reviewed-by: Chengming Zhou Acked-by: Sergey Senozhatsky [zram, zsmalloc] Acked-by: Johannes Weiner Acked-by: Yosry Ahmed [zswap/zsmalloc] Cc: "Huang, Ying" Cc: Joanthan Cameron Cc: Minchan Kim Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/zpool.h | 4 ++-- include/linux/zsmalloc.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 52f30e526607..369ef068fad8 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -22,7 +22,7 @@ const char *zpool_get_type(struct zpool *pool); void zpool_destroy_pool(struct zpool *pool); int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, - unsigned long *handle); + unsigned long *handle, const int nid); void zpool_free(struct zpool *pool, unsigned long handle); @@ -64,7 +64,7 @@ struct zpool_driver { void (*destroy)(void *pool); int (*malloc)(void *pool, size_t size, gfp_t gfp, - unsigned long *handle); + unsigned long *handle, const int nid); void (*free)(void *pool, unsigned long handle); void *(*obj_read_begin)(void *pool, unsigned long handle, diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index c26baf9fb331..13e9cc5490f7 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -26,7 +26,8 @@ struct zs_pool; struct zs_pool *zs_create_pool(const char *name); void zs_destroy_pool(struct zs_pool *pool); -unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags); +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags, + const int nid); void zs_free(struct zs_pool *pool, unsigned long obj); size_t zs_huge_class_size(struct zs_pool *pool); -- cgit v1.2.3 From 737e9d021993c99377bb61e762c0d6945a37615b Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Mon, 27 Jan 2025 10:34:03 -0500 Subject: memory: implement memory_block_advise/probe_max_size Patch series "memory,x86,acpi: hotplug memory alignment advisement", v8. When physical address regions are not aligned to memory block size, the misaligned portion is lost (stranded capacity). Block size (min/max/selected) is architecture defined. Most architectures tend to use the minimum block size or some simplistic heurist. On x86, memory block size increases up to 2GB, and is otherwise fitted to the alignment of non-hotplug (i.e. not special purpose memory). CXL exposes its memory for management through the ACPI CEDT (CXL Early Detection Table) in a field called the CXL Fixed Memory Window. Per the CXL specification, this memory must be aligned to at least 256MB. When a CFMW aligns on a size less than the block size, this causes a loss of up to 2GB per CFMW on x86. It is not uncommon for CFMW to be allocated per-device - though this behavior is BIOS defined. This patch set provides 3 things: 1) implement advise/query functions in driverse/base/memory.c to report/query architecture agnostic hotplug block alignment advice. 2) update x86 memblock size logic to consider the hotplug advice 3) add code in acpi/numa/srat.c to report CFMW alignment advice The advisement interfaces are design to be called during arch_init code prior to allocator and smp_init. start_kernel will call these through setup_arch() (via acpi and mm/init_64.c on x86), which occurs prior to mm_core_init and smp_init - so no need for atomics. There's an attempt to signal callers to advise() that query has already occurred, but this is predicated on the notion that query actually occurs (which presently only happens on the x86 arch). This is to assist debugging future users. Otherwise, the advise() call has been marked __init to help static discovery of bad call times. Once query is called the first time, it will always return the same value. Interfaces return -EBUSY and 0 respectively on systems without hotplug. This patch (of 3): Hotplug memory sources may have opinions on what the memblock size should be - usually for alignment purposes. For example, CXL memory extents can be 256MB with a matching alignment. If this size/alignment is smaller than the block size, it can result in stranded capacity. Implement memory_block_advise_max_size for use prior to allocator init, for software to advise the system on the max block size. Implement memory_block_probe_max_size for use by arch init code to calculate the best block size. Use of advice is architecture defined. The probe value can never change after first probe. Calls to advise after probe will return -EBUSY to aid debugging. On systems without hotplug, always return -ENODEV and 0 respectively. Link: https://lkml.kernel.org/r/20250127153405.3379117-1-gourry@gourry.net Link: https://lkml.kernel.org/r/20250127153405.3379117-2-gourry@gourry.net Signed-off-by: Gregory Price Suggested-by: Ira Weiny Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Acked-by: Dan Williams Tested-by: Fan Ni Reviewed-by: Ira Weiny Acked-by: Oscar Salvador Cc: Alison Schofield Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Bruno Faccini Cc: Dave Hansen Cc: Dave Jiang Cc: Greg Kroah-Hartman Cc: Haibo Xu Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joanthan Cameron Cc: Len Brown Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Robert Richter Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- include/linux/memory.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 12daa6ec7d09..5ec4e6d209b9 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -149,6 +149,14 @@ static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) { return 0; } +static inline int memory_block_advise_max_size(unsigned long size) +{ + return -ENODEV; +} +static inline unsigned long memory_block_advised_max_size(void) +{ + return 0; +} #else /* CONFIG_MEMORY_HOTPLUG */ extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); @@ -181,6 +189,8 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, void memory_block_add_nid(struct memory_block *mem, int nid, enum meminit_context context); #endif /* CONFIG_NUMA */ +int memory_block_advise_max_size(unsigned long size); +unsigned long memory_block_advised_max_size(void); #endif /* CONFIG_MEMORY_HOTPLUG */ /* -- cgit v1.2.3 From b4c829fa4d56f3b566bbbb41c9a8ff0c83ae84c5 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 31 Mar 2025 19:10:25 -0700 Subject: mm/compaction: use folio in hugetlb pathway Use a folio in the hugetlb pathway during the compaction migrate-able pageblock scan. This removes a call to compound_head(). Link: https://lkml.kernel.org/r/20250401021025.637333-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Oscar Salvador Reviewed-by: Zi Yan Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8f3ac832ee7f..a57bed83c657 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -695,7 +695,7 @@ struct huge_bootmem_page { bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m); -int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); +int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); void wait_for_freed_hugetlb_folios(void); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, @@ -1083,7 +1083,7 @@ static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, return NULL; } -static inline int isolate_or_dissolve_huge_page(struct page *page, +static inline int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) { return -ENOMEM; -- cgit v1.2.3 From 2e976567233228ff928c2405f7e03ebb7fb7aa50 Mon Sep 17 00:00:00 2001 From: Ignacio Encinas Date: Mon, 31 Mar 2025 21:57:05 +0200 Subject: mm: annotate data race in update_hiwater_rss mm_struct.hiwater_rss can be accessed concurrently without proper synchronization as reported by KCSAN. This data race is benign as it only affects accounting information. Annotate it with data_race() to make KCSAN happy. Link: https://lkml.kernel.org/r/20250331-mm-maxrss-data-race-v2-1-cf958e6205bf@iencinas.com Signed-off-by: Ignacio Encinas Reported-by: syzbot+419c4b42acc36c420ad3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67e3390c.050a0220.1ec46.0001.GAE@google.com/ Suggested-by: Lorenzo Stoakes Acked-by: Pedro Falcato Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index dcdb798184ef..1690f21e7808 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -2796,7 +2797,7 @@ static inline void update_hiwater_rss(struct mm_struct *mm) { unsigned long _rss = get_mm_rss(mm); - if ((mm)->hiwater_rss < _rss) + if (data_race(mm->hiwater_rss) < _rss) (mm)->hiwater_rss = _rss; } -- cgit v1.2.3 From 979f3ef0f798d9b4fda4806d37fb1a264fc38566 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 21 Mar 2025 22:02:21 +1000 Subject: mm: fix parameter passed to page_mapcount_is_type() Patch series "Fix parameter passed to page_mapcount_is_type()", v2. Found by code inspection. There are two places where the parameter passed to page_mapcount_is_type() is (page->_mapcount), which is incorrect since it should be one more than the value, as explained in the comments to page_mapcount_is_type(): (a) page_has_type() in page-flags.h (b) __dump_folio() in mm/debug.c PATCH[1] fixes the parameter for (a) PATCH[2] fixes the parameter for (b) Note that the issue doesn't cause any visible impacts due to the safety gap introduced by PGTY_mapcount_underflow limit. So the tag 'Cc: stable@vger.kernel.org' isn't needed. This patch (of 2): As the comments of page_mapcount_is_type() indicate, the parameter passed to the function should be one more than page->_mapcount. However, page->_mapcount (equivalent to page->page_type) is passed to the function by commit 4ffca5a96678 ("mm: support only one page_type per page") page_type_has_type() is replaced by page_mapcount_is_type(), but the parameter isn't adjusted. Fix it by replacing page_mapcount_is_type() with page_type_has_type() in page_has_type(). Note that the issue doesn't cause any visible impacts due to the safety gap introduced by PGTY_mapcount_underflow limit. Link: https://lkml.kernel.org/r/20250321120222.1456770-1-gshan@redhat.com Link: https://lkml.kernel.org/r/20250321120222.1456770-2-gshan@redhat.com Fixes: 4ffca5a96678 ("mm: support only one page_type per page") Signed-off-by: Gavin Shan Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: gehao Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e6a21b62dcce..d3909cb1e576 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -982,7 +982,7 @@ static inline bool page_mapcount_is_type(unsigned int mapcount) static inline bool page_has_type(const struct page *page) { - return page_mapcount_is_type(data_race(page->page_type)); + return page_type_has_type(data_race(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ -- cgit v1.2.3 From 8a5577428e8e586a107ee5f39eb4c86d32c971cd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Mar 2025 12:37:12 +0100 Subject: kernel/events/uprobes: pass VMA to set_swbp(), set_orig_insn() and uprobe_write_opcode() We already have the VMA, no need to look it up using get_user_page_vma_remote(). We can now switch to get_user_pages_remote(). Link: https://lkml.kernel.org/r/20250321113713.204682-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andrii Nakryiko Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: "Masami Hiramatsu (Google)" Cc: Matthew Wilcox (Oracle) Cc: Namhyung kim Cc: Russel King Cc: tongtiangen Signed-off-by: Andrew Morton --- include/linux/uprobes.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 2e46b69ff0a6..516217c39094 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -188,13 +188,13 @@ struct uprobes_state { }; extern void __init uprobes_init(void); -extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); -extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); +extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); +extern int set_orig_insn(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); extern bool is_swbp_insn(uprobe_opcode_t *insn); extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); -extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); -- cgit v1.2.3 From e064e7384f991c7df81999cad4ce30fed7ef7d88 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 7 Apr 2025 11:01:11 +0530 Subject: mm/ptdump: split note_page() into level specific callbacks Patch series "mm/ptdump: Drop assumption that pxd_val() is u64", v2. Last argument passed down in note_page() is u64 assuming pxd_val() returned value (all page table levels) is 64 bit - which might not be the case going ahead when D128 page tables is enabled on arm64 platform. Besides pxd_val() is very platform specific and its type should not be assumed in generic MM. A similar problem exists for effective_prot(), although it is restricted to x86 platform. This series splits note_page() and effective_prot() into individual page table level specific callbacks which accepts corresponding pxd_t page table entry as an argument instead and later on all subscribing platforms could derive pxd_val() from the table entries as required and proceed as before. Define ptdesc_t type which describes the basic page table descriptor layout on arm64 platform. Subsequently all level specific pxxval_t descriptors are derived from ptdesc_t thus establishing a common original format, which can also be appropriate for page table entries, masks and protection values etc which are used at all page table levels. This patch (of 3): Last argument passed down in note_page() is u64 assuming pxd_val() returned value (all page table levels) is 64 bit - which might not be the case going ahead when D128 page tables is enabled on arm64 platform. Besides pxd_val() is very platform specific and its type should not be assumed in generic MM. Split note_page() into individual page table level specific callbacks which accepts corresponding pxd_t argument instead and then subscribing platforms just derive pxd_val() from the entries as required and proceed as earlier. Also add a note_page_flush() callback for flushing the last page table page that was being handled earlier via level = -1. Link: https://lkml.kernel.org/r/20250407053113.746295-1-anshuman.khandual@arm.com Link: https://lkml.kernel.org/r/20250407053113.746295-2-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Catalin Marinas Cc: Will Deacon Cc: Madhavan Srinivasan Cc: Nicholas Piggin Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Ard Biesheuvel Cc: Dave Hansen Cc: Mark Rutland Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/ptdump.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index 8dbd51ea8626..1c1eb1fae199 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -11,9 +11,12 @@ struct ptdump_range { }; struct ptdump_state { - /* level is 0:PGD to 4:PTE, or -1 if unknown */ - void (*note_page)(struct ptdump_state *st, unsigned long addr, - int level, u64 val); + void (*note_page_pte)(struct ptdump_state *st, unsigned long addr, pte_t pte); + void (*note_page_pmd)(struct ptdump_state *st, unsigned long addr, pmd_t pmd); + void (*note_page_pud)(struct ptdump_state *st, unsigned long addr, pud_t pud); + void (*note_page_p4d)(struct ptdump_state *st, unsigned long addr, p4d_t p4d); + void (*note_page_pgd)(struct ptdump_state *st, unsigned long addr, pgd_t pgd); + void (*note_page_flush)(struct ptdump_state *st); void (*effective_prot)(struct ptdump_state *st, int level, u64 val); const struct ptdump_range *range; }; -- cgit v1.2.3 From 08978fc3b0d5709583f6dc2072a8607d0b527860 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 7 Apr 2025 11:01:12 +0530 Subject: mm/ptdump: split effective_prot() into level specific callbacks Last argument in effective_prot() is u64 assuming pxd_val() returned value (all page table levels) is 64 bit. pxd_val() is very platform specific and its type should not be assumed in generic MM. Split effective_prot() into individual page table level specific callbacks which accepts corresponding pxd_t argument instead and then the subscribing platform (only x86) just derive pxd_val() from the entries as required and proceed as earlier. Link: https://lkml.kernel.org/r/20250407053113.746295-3-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Dave Hansen Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Nicholas Piggin Cc: Ryan Roberts Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/ptdump.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index 1c1eb1fae199..240bd3bff18d 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -17,7 +17,11 @@ struct ptdump_state { void (*note_page_p4d)(struct ptdump_state *st, unsigned long addr, p4d_t p4d); void (*note_page_pgd)(struct ptdump_state *st, unsigned long addr, pgd_t pgd); void (*note_page_flush)(struct ptdump_state *st); - void (*effective_prot)(struct ptdump_state *st, int level, u64 val); + void (*effective_prot_pte)(struct ptdump_state *st, pte_t pte); + void (*effective_prot_pmd)(struct ptdump_state *st, pmd_t pmd); + void (*effective_prot_pud)(struct ptdump_state *st, pud_t pud); + void (*effective_prot_p4d)(struct ptdump_state *st, p4d_t p4d); + void (*effective_prot_pgd)(struct ptdump_state *st, pgd_t pgd); const struct ptdump_range *range; }; -- cgit v1.2.3 From 4c97a17a252bf8396f7bd65efced00bf401a8c25 Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Thu, 20 Mar 2025 11:22:19 +0100 Subject: xarray: make xa_alloc_cyclic() return 0 on all success cases Change xa_alloc_cyclic() to return 0 even on wrap-around. Do the same for xa_alloc_cyclic_irq() and xa_alloc_cyclic_bh(). This will prevent any future bug of treating return of 1 as an error: int ret = xa_alloc_cyclic(...) if (ret) // currently mishandles ret==1 goto failure; If there will be someone interested in when wrap-around occurs, there is still __xa_alloc_cyclic() that behaves as before. For now there is no such user. Link: https://lkml.kernel.org/r/20250320102219.8101-1-przemyslaw.kitszel@intel.com Signed-off-by: Przemek Kitszel Suggested-by: Matthew Wilcox Link: https://lore.kernel.org/netdev/Z9gUd-5t8b5NX2wE@casper.infradead.org Cc: Andriy Shevchenko Cc: Dave Hansen Cc: Michal Swiatkowski Cc: Przemek Kitszel Signed-off-by: Andrew Morton --- include/linux/xarray.h | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 78eede109b1a..be850174e802 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -965,10 +965,12 @@ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * + * Note that callers interested in whether wrapping has occurred should + * use __xa_alloc_cyclic() instead. + * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. - * Return: 0 if the allocation succeeded without wrapping. 1 if the - * allocation succeeded after wrapping, -ENOMEM if memory could not be + * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, @@ -981,7 +983,7 @@ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock(xa); - return err; + return err < 0 ? err : 0; } /** @@ -1002,10 +1004,12 @@ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * + * Note that callers interested in whether wrapping has occurred should + * use __xa_alloc_cyclic() instead. + * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. - * Return: 0 if the allocation succeeded without wrapping. 1 if the - * allocation succeeded after wrapping, -ENOMEM if memory could not be + * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, @@ -1018,7 +1022,7 @@ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_bh(xa); - return err; + return err < 0 ? err : 0; } /** @@ -1039,10 +1043,12 @@ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * + * Note that callers interested in whether wrapping has occurred should + * use __xa_alloc_cyclic() instead. + * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. - * Return: 0 if the allocation succeeded without wrapping. 1 if the - * allocation succeeded after wrapping, -ENOMEM if memory could not be + * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, @@ -1055,7 +1061,7 @@ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_irq(xa); - return err; + return err < 0 ? err : 0; } /** -- cgit v1.2.3 From a40b3fa844b4c29276a228ce878bd427a3149d37 Mon Sep 17 00:00:00 2001 From: Liu Ye Date: Tue, 18 Mar 2025 14:32:26 +0800 Subject: fs/proc/page: refactor to reduce code duplication kpageflags_read() and kpagecgroup_read() are quite similar to kpagecount_read(). Refactor common code into a helper function to reduce code duplication. Link: https://lkml.kernel.org/r/20250318063226.223284-1-liuyerd@163.com Signed-off-by: Liu Ye Acked-by: David Hildenbrand Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Ran Xiaokai Cc: Roman Gushchin Cc: Shakeel Butt Cc: Svetly Todorov Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 53364526d877..5264d148bdd9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1793,6 +1793,10 @@ static inline void count_objcg_events(struct obj_cgroup *objcg, { } +static inline ino_t page_cgroup_ino(struct page *page) +{ + return 0; +} #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) -- cgit v1.2.3 From d82d3bf4115217bb3f43f9320bad5d68a35c278f Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 8 Apr 2025 10:52:11 +0100 Subject: mm: pass mm down to pagetable_{pte,pmd}_ctor Patch series "Always call constructor for kernel page tables", v2. There has been much confusion around exactly when page table constructors/destructors (pagetable_*_[cd]tor) are supposed to be called. They were initially introduced for user PTEs only (to support split page table locks), then at the PMD level for the same purpose. Accounting was added later on, starting at the PTE level and then moving to higher levels (PMD, PUD). Finally, with my earlier series "Account page tables at all levels" [1], the ctor/dtor is run for all levels, all the way to PGD. I thought this was the end of the story, and it hopefully is for user pgtables, but I was wrong for what concerns kernel pgtables. The current situation there makes very little sense: * At the PTE level, the ctor/dtor is not called (at least in the generic implementation). Specific helpers are used for kernel pgtables at this level (pte_{alloc,free}_kernel()) and those have never called the ctor/dtor, most likely because they were initially irrelevant in the kernel case. * At all other levels, the ctor/dtor is normally called. This is potentially wasteful at the PMD level (more on that later). This series aims to ensure that the ctor/dtor is always called for kernel pgtables, as it already is for user pgtables. Besides consistency, the main motivation is to guarantee that ctor/dtor hooks are systematically called; this makes it possible to insert hooks to protect page tables [2], for instance. There is however an extra challenge: split locks are not used for kernel pgtables, and it would therefore be wasteful to initialise them (ptlock_init()). It is worth clarifying exactly when split locks are used. They clearly are for user pgtables, but as illustrated in commit 61444cde9170 ("ARM: 8591/1: mm: use fully constructed struct pages for EFI pgd allocations"), they also are for special page tables like efi_mm. The one case where split locks are definitely unused is pgtables owned by init_mm; this is consistent with the behaviour of apply_to_pte_range(). The approach chosen in this series is therefore to pass the mm associated to the pgtables being constructed to pagetable_{pte,pmd}_ctor() (patch 1), and skip ptlock_init() if mm == &init_mm (patch 3 and 7). This makes it possible to call the PTE ctor/dtor from pte_{alloc,free}_kernel() without unintended consequences (patch 3). As a result the accounting functions are now called at all levels for kernel pgtables, and split locks are never initialised. In configurations where ptlocks are dynamically allocated (32-bit, PREEMPT_RT, etc.) and ARCH_ENABLE_SPLIT_PMD_PTLOCK is selected, this series results in the removal of a kmem_cache allocation for every kernel PMD. Additionally, for certain architectures that do not use such as s390, the same optimisation occurs at the PTE level. === Things get more complicated when it comes to special pgtable allocators (patch 8-12). All architectures need such allocators to create initial kernel pgtables; we are not concerned with those as the ctor cannot be called so early in the boot sequence. However, those allocators may also be used later in the boot sequence or during normal operations. There are two main use-cases: 1. Mapping EFI memory: efi_mm (arm, arm64, riscv) 2. arch_add_memory(): init_mm The ctor is already explicitly run (at the PTE/PMD level) in the first case, as required for pgtables that are not associated with init_mm. However the same allocators may also be used for the second use-case (or others), and this is where it gets messy. Patch 1 calls the ctor with NULL as mm in those situations, as the actual mm isn't available. Practically this means that ptlocks will be unconditionally initialised. This is fine on arm - create_mapping_late() is only used for the EFI mapping. On arm64, __create_pgd_mapping() is also used by arch_add_memory(); patch 8/9/11 ensure that ctors are called at all levels with the appropriate mm. The situation is similar on riscv, but propagating the mm down to the ctor would require significant refactoring. Since they are already called unconditionally, this series leaves riscv no worse off - patch 10 adds comments to clarify the situation. From a cursory look at other architectures implementing arch_add_memory(), s390 and x86 may also need a similar treatment to add constructor calls. This is to be taken care of in a future version or as a follow-up. === The complications in those special pgtable allocators beg the question: does it really make sense to treat efi_mm and init_mm differently in e.g. apply_to_pte_range()? Maybe what we really need is a way to tell if an mm corresponds to user memory or not, and never use split locks for non-user mm's. Feedback and suggestions welcome! This patch (of 12): In preparation for calling constructors for all kernel page tables while eliding unnecessary ptlock initialisation, let's pass down the associated mm to the PTE/PMD level ctors. (These are the two levels where ptlocks are used.) In most cases the mm is already around at the point of calling the ctor so we simply pass it down. This is however not the case for special page table allocators: * arch/arm/mm/mmu.c * arch/arm64/mm/mmu.c * arch/riscv/mm/init.c In those cases, the page tables being allocated are either for standard kernel memory (init_mm) or special page directories, which may not be associated to any mm. For now let's pass NULL as mm; this will be refined where possible in future patches. No functional change in this patch. Link: https://lore.kernel.org/linux-mm/20250103184415.2744423-1-kevin.brodsky@arm.com/ [1] Link: https://lore.kernel.org/linux-hardening/20250203101839.1223008-1-kevin.brodsky@arm.com/ [2] Link: https://lkml.kernel.org/r/20250408095222.860601-1-kevin.brodsky@arm.com Link: https://lkml.kernel.org/r/20250408095222.860601-2-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Alexander Gordeev [s390] Cc: Albert Ou Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Geert Uytterhoeven Cc: Kevin Brodsky Cc: Linus Waleij Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ryan Roberts Cc: Will Deacon Cc: Yang Shi Cc: Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1690f21e7808..fa22b17e337e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3147,7 +3147,8 @@ static inline void pagetable_dtor_free(struct ptdesc *ptdesc) pagetable_free(ptdesc); } -static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) +static inline bool pagetable_pte_ctor(struct mm_struct *mm, + struct ptdesc *ptdesc) { if (!ptlock_init(ptdesc)) return false; @@ -3253,7 +3254,8 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) return ptl; } -static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) +static inline bool pagetable_pmd_ctor(struct mm_struct *mm, + struct ptdesc *ptdesc) { if (!pmd_ptlock_init(ptdesc)) return false; -- cgit v1.2.3 From 49f5996664201a3581ee5ea5949ee7d5fafb9d86 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 8 Apr 2025 10:52:13 +0100 Subject: mm: call ctor/dtor for kernel PTEs Since [1], constructors/destructors are expected to be called for all page table pages, at all levels and for both user and kernel pgtables. There is however one glaring exception: kernel PTEs are managed via separate helpers (pte_alloc_kernel/pte_free_kernel), which do not call the [cd]tor, at least not in the generic implementation. The most obvious reason for this anomaly is that init_mm is special-cased not to use split page table locks. As a result calling ptlock_init() for PTEs associated with init_mm would be wasteful, potentially resulting in dynamic memory allocation. However, pgtable [cd]tors perform other actions - currently related to accounting/statistics, and potentially more functionally significant in the future. Now that pagetable_pte_ctor() is passed the associated mm, we can make it skip the call to ptlock_init() for init_mm; this allows us to call the ctor from pte_alloc_one_kernel() too. This is matched by a call to the pgtable destructor in pte_free_kernel(); no special-casing is needed on that path, as ptlock_free() is already called unconditionally. (ptlock_free() is a no-op unless a ptlock was allocated for the given PTP.) This patch ensures that all architectures that rely on call the [cd]tor for kernel PTEs. pte_free_kernel() cannot be overridden so changing the generic implementation is sufficient. pte_alloc_one_kernel() can be overridden using __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL, and a few architectures implement it by calling the page allocator directly. We amend those so that they call the generic __pte_alloc_one_kernel() instead, if possible, ensuring that the ctor is called. A few architectures do not use ; those will be taken care of separately. [1] https://lore.kernel.org/linux-mm/20250103184415.2744423-1-kevin.brodsky@arm.com/ Link: https://lkml.kernel.org/r/20250408095222.860601-4-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Reviewed-by: Alexander Gordeev # s390 Cc: Albert Ou Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Geert Uytterhoeven Cc: Linus Waleij Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ryan Roberts Cc: Will Deacon Cc: Cc: Yang Shi Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index fa22b17e337e..ce6832787f6d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3150,7 +3150,7 @@ static inline void pagetable_dtor_free(struct ptdesc *ptdesc) static inline bool pagetable_pte_ctor(struct mm_struct *mm, struct ptdesc *ptdesc) { - if (!ptlock_init(ptdesc)) + if (mm != &init_mm && !ptlock_init(ptdesc)) return false; __pagetable_ctor(ptdesc); return true; -- cgit v1.2.3 From 8240d8d3c5fb65fbdace7ae4db909b51f2253da7 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 8 Apr 2025 10:52:17 +0100 Subject: mm: skip ptlock_init() for kernel PMDs Split page table locks are not used for pgtables associated to init_mm, at any level. pte_alloc_kernel() does not call ptlock_init() as a result. There is however no separate alloc/free functions for kernel PMDs, and pmd_ptlock_init() is called unconditionally. When ALLOC_SPLIT_PTLOCKS is true (e.g. 32-bit architectures or if CONFIG_PREEMPT_RT is selected), this results in unnecessary dynamic memory allocation every time a kernel PMD is allocated. Now that pagetable_pmd_ctor() is passed the associated mm, we can easily remove this overhead by skipping pmd_ptlock_init() if the pgtable is associated to init_mm. No special-casing is needed on the dtor path, as ptlock_free() is already called unconditionally for all levels. (ptlock_free() is a no-op unless a ptlock was allocated for the given PTP.) Link: https://lkml.kernel.org/r/20250408095222.860601-8-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Cc: Albert Ou Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Geert Uytterhoeven Cc: Linus Waleij Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ryan Roberts Cc: Will Deacon Cc: Cc: Yang Shi Cc: Dave Hansen Cc: Alexander Gordeev Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ce6832787f6d..5eb0d77c4438 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3257,7 +3257,7 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) static inline bool pagetable_pmd_ctor(struct mm_struct *mm, struct ptdesc *ptdesc) { - if (!pmd_ptlock_init(ptdesc)) + if (mm != &init_mm && !pmd_ptlock_init(ptdesc)) return false; ptdesc_pmd_pts_init(ptdesc); __pagetable_ctor(ptdesc); -- cgit v1.2.3 From ad88fc17d2dafe45e40de2af80207f4b2e3b1f71 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 10 Apr 2025 19:14:43 +0000 Subject: maple_tree: use vacant nodes to reduce worst case allocations In order to determine the store type for a maple tree operation, a walk of the tree is done through mas_wr_walk(). This function descends the tree until a spanning write is detected or we reach a leaf node. While descending, keep track of the height at which we encounter a node with available space. This is done by checking if mas->end is less than the number of slots a given node type can fit. Now that the height of the vacant node is tracked, we can use the difference between the height of the tree and the height of the vacant node to know how many levels we will have to propagate creating new nodes. Update mas_prealloc_calc() to consider the vacant height and reduce the number of worst-case allocations. Rebalancing and spanning stores are not supported and fall back to using the full height of the tree for allocations. Update preallocation testing assertions to take into account vacant height. Link: https://lkml.kernel.org/r/20250410191446.2474640-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index cbbcd18d4186..657adb33e61e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -463,6 +463,7 @@ struct ma_wr_state { void __rcu **slots; /* mas->node->slots pointer */ void *entry; /* The entry to write */ void *content; /* The existing entry that is being overwritten */ + unsigned char vacant_height; /* Height of lowest node with free space */ }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) @@ -498,6 +499,7 @@ struct ma_wr_state { .mas = ma_state, \ .content = NULL, \ .entry = wr_entry, \ + .vacant_height = 0 \ } #define MA_TOPIARY(name, tree) \ -- cgit v1.2.3 From 271152a973cb01c135d29e91d1a05f51fbd88a9c Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 10 Apr 2025 19:14:45 +0000 Subject: maple_tree: add sufficient height In order to support rebalancing and spanning stores using less than the worst case number of nodes, we need to track more than just the vacant height. Using only vacant height to reduce the worst case maple node allocation count can lead to a shortcoming of nodes in the following scenarios. For rebalancing writes, when a leaf node becomes insufficient, it may be combined with a sibling into a single node. This means that the parent node which has entries for this children will lose one entry. If this parent node was just meeting the minimum entries, losing one entry will now cause this parent node to be insufficient. This leads to a cascading operation of rebalancing at different levels and can lead to more node allocations than simply using vacant height can return. For spanning writes, a similar situation occurs. At the location at which a spanning write is detected, the number of ancestor nodes may similarly need to rebalanced into a smaller number of nodes and the same cascading situation could occur. To use less than the full height of the tree for the number of allocations, we also need to track the height at which a non-leaf node cannot become insufficient. This means even if a rebalance occurs to a child of this node, it currently has enough entries that it can lose one without any further action. This field is stored in the maple write state as sufficient height. In mas_prealloc_calc() when figuring out how many nodes to allocate, we check if the vacant node is lower in the tree than a sufficient node (has a larger value). If it is, we cannot use the vacant height and must use the difference in the height and sufficient height as the basis for the number of nodes needed. An off by one bug was also discovered in mast_overflow() where it is using >= rather than >. This caused extra iterations of the mas_spanning_rebalance() loop and lead to unneeded allocations. A test is also added to check the number of allocations is correct. Link: https://lkml.kernel.org/r/20250410191446.2474640-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 657adb33e61e..9ef129038224 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -464,6 +464,7 @@ struct ma_wr_state { void *entry; /* The entry to write */ void *content; /* The existing entry that is being overwritten */ unsigned char vacant_height; /* Height of lowest node with free space */ + unsigned char sufficient_height;/* Height of lowest node with min sufficiency + 1 nodes */ }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) @@ -499,7 +500,8 @@ struct ma_wr_state { .mas = ma_state, \ .content = NULL, \ .entry = wr_entry, \ - .vacant_height = 0 \ + .vacant_height = 0, \ + .sufficient_height = 0 \ } #define MA_TOPIARY(name, tree) \ -- cgit v1.2.3 From 06340b927051bf71b59a9cd4cff3417247318251 Mon Sep 17 00:00:00 2001 From: Fan Ni Date: Wed, 16 Apr 2025 13:12:15 -0700 Subject: mm: convert free_page_and_swap_cache() to free_folio_and_swap_cache() free_page_and_swap_cache() takes a struct page pointer as input parameter, but it will immediately convert it to folio and all operations following within use folio instead of page. It makes more sense to pass in folio directly. Convert free_page_and_swap_cache() to free_folio_and_swap_cache() to consume folio directly. Link: https://lkml.kernel.org/r/20250416201720.41678-1-nifan.cxl@gmail.com Signed-off-by: Fan Ni Acked-by: Davidlohr Bueso Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Adam Manzanares Cc: "Aneesh Kumar K.V" Cc: Heiko Carstens Cc: Luis Chamberalin Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/swap.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index db46b25a65ae..4e4e27d3ce3d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -450,7 +450,7 @@ static inline unsigned long total_swapcache_pages(void) } void free_swap_cache(struct folio *folio); -void free_page_and_swap_cache(struct page *); +void free_folio_and_swap_cache(struct folio *folio); void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; @@ -520,10 +520,8 @@ static inline void put_swap_device(struct swap_info_struct *si) #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) -/* only sparc can not include linux/pagemap.h in this file - * so leave put_page and release_pages undeclared... */ -#define free_page_and_swap_cache(page) \ - put_page(page) +#define free_folio_and_swap_cache(folio) \ + folio_put(folio) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); -- cgit v1.2.3 From 75404e07663b1622948944cf31531fa87cb1785d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 16 Apr 2025 11:38:36 +0100 Subject: mm: move mmap/vma locking logic into specific files Currently the VMA and mmap locking logic is entangled in two of the most overwrought files in mm - include/linux/mm.h and mm/memory.c. Separate this logic out so we can more easily make changes and create an appropriate MAINTAINERS entry that spans only the logic relating to locking. This should have no functional change. Care is taken to avoid dependency loops, we must regrettably keep release_fault_lock() and assert_fault_locked() in mm.h as a result due to the dependence on the vm_fault type. Additionally we must declare rcuwait_wake_up() manually to avoid a dependency cycle on linux/rcuwait.h. Additionally move the nommu implementatino of lock_mm_and_find_vma() to mmap_lock.c so everything lock-related is in one place. Link: https://lkml.kernel.org/r/bec6c8e29fa8de9267a811a10b1bdae355d67ed4.1744799282.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: "Paul E . McKenney" Cc: SeongJae Park Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/mm.h | 231 +--------------------------------------------- include/linux/mmap_lock.h | 227 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 231 insertions(+), 227 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5eb0d77c4438..9b701cfbef22 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -671,204 +671,11 @@ static inline void vma_numab_state_init(struct vm_area_struct *vma) {} static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_PER_VMA_LOCK -static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - static struct lock_class_key lockdep_key; - - lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); -#endif - if (reset_refcnt) - refcount_set(&vma->vm_refcnt, 0); - vma->vm_lock_seq = UINT_MAX; -} - -static inline bool is_vma_writer_only(int refcnt) -{ - /* - * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma - * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on - * a detached vma happens only in vma_mark_detached() and is a rare - * case, therefore most of the time there will be no unnecessary wakeup. - */ - return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; -} - -static inline void vma_refcount_put(struct vm_area_struct *vma) -{ - /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ - struct mm_struct *mm = vma->vm_mm; - int oldcnt; - - rwsem_release(&vma->vmlock_dep_map, _RET_IP_); - if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { - - if (is_vma_writer_only(oldcnt - 1)) - rcuwait_wake_up(&mm->vma_writer_wait); - } -} - -/* - * Try to read-lock a vma. The function is allowed to occasionally yield false - * locked result to avoid performance overhead, in which case we fall back to - * using mmap_lock. The function should never yield false unlocked result. - * False locked result is possible if mm_lock_seq overflows or if vma gets - * reused and attached to a different mm before we lock it. - * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got - * detached. - */ -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, - struct vm_area_struct *vma) -{ - int oldcnt; - - /* - * Check before locking. A race might cause false locked result. - * We can use READ_ONCE() for the mm_lock_seq here, and don't need - * ACQUIRE semantics, because this is just a lockless check whose result - * we don't rely on for anything - the mm_lock_seq read against which we - * need ordering is below. - */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) - return NULL; - - /* - * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() - * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. - * Acquire fence is required here to avoid reordering against later - * vm_lock_seq check and checks inside lock_vma_under_rcu(). - */ - if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, - VMA_REF_LIMIT))) { - /* return EAGAIN if vma got detached from under us */ - return oldcnt ? NULL : ERR_PTR(-EAGAIN); - } - - rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); - /* - * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. - * False unlocked result is impossible because we modify and check - * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq - * modification invalidates all existing locks. - * - * We must use ACQUIRE semantics for the mm_lock_seq so that if we are - * racing with vma_end_write_all(), we only start reading from the VMA - * after it has been unlocked. - * This pairs with RELEASE semantics in vma_end_write_all(). - */ - if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { - vma_refcount_put(vma); - return NULL; - } - - return vma; -} - -/* - * Use only while holding mmap read lock which guarantees that locking will not - * fail (nobody can concurrently write-lock the vma). vma_start_read() should - * not be used in such cases because it might fail due to mm_lock_seq overflow. - * This functionality is used to obtain vma read lock and drop the mmap read lock. - */ -static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) -{ - int oldcnt; - - mmap_assert_locked(vma->vm_mm); - if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, - VMA_REF_LIMIT))) - return false; - - rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); - return true; -} - -/* - * Use only while holding mmap read lock which guarantees that locking will not - * fail (nobody can concurrently write-lock the vma). vma_start_read() should - * not be used in such cases because it might fail due to mm_lock_seq overflow. - * This functionality is used to obtain vma read lock and drop the mmap read lock. - */ -static inline bool vma_start_read_locked(struct vm_area_struct *vma) -{ - return vma_start_read_locked_nested(vma, 0); -} - -static inline void vma_end_read(struct vm_area_struct *vma) -{ - vma_refcount_put(vma); -} - -/* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) -{ - mmap_assert_write_locked(vma->vm_mm); - - /* - * current task is holding mmap_write_lock, both vma->vm_lock_seq and - * mm->mm_lock_seq can't be concurrently modified. - */ - *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; - return (vma->vm_lock_seq == *mm_lock_seq); -} - -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); - -/* - * Begin writing to a VMA. - * Exclude concurrent readers under the per-VMA lock until the currently - * write-locked mmap_lock is dropped or downgraded. - */ -static inline void vma_start_write(struct vm_area_struct *vma) -{ - unsigned int mm_lock_seq; - - if (__is_vma_write_locked(vma, &mm_lock_seq)) - return; - - __vma_start_write(vma, mm_lock_seq); -} - -static inline void vma_assert_write_locked(struct vm_area_struct *vma) -{ - unsigned int mm_lock_seq; - - VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); -} - -static inline void vma_assert_locked(struct vm_area_struct *vma) -{ - unsigned int mm_lock_seq; - - VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && - !__is_vma_write_locked(vma, &mm_lock_seq), vma); -} - /* - * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these - * assertions should be made either under mmap_write_lock or when the object - * has been isolated under mmap_write_lock, ensuring no competing writers. + * These must be here rather than mmap_lock.h as dependent on vm_fault type, + * declared in this header. */ -static inline void vma_assert_attached(struct vm_area_struct *vma) -{ - WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); -} - -static inline void vma_assert_detached(struct vm_area_struct *vma) -{ - WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); -} - -static inline void vma_mark_attached(struct vm_area_struct *vma) -{ - vma_assert_write_locked(vma); - vma_assert_detached(vma); - refcount_set_release(&vma->vm_refcnt, 1); -} - -void vma_mark_detached(struct vm_area_struct *vma); - +#ifdef CONFIG_PER_VMA_LOCK static inline void release_fault_lock(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) @@ -884,36 +691,7 @@ static inline void assert_fault_locked(struct vm_fault *vmf) else mmap_assert_locked(vmf->vma->vm_mm); } - -struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, - unsigned long address); - -#else /* CONFIG_PER_VMA_LOCK */ - -static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, - struct vm_area_struct *vma) - { return NULL; } -static inline void vma_end_read(struct vm_area_struct *vma) {} -static inline void vma_start_write(struct vm_area_struct *vma) {} -static inline void vma_assert_write_locked(struct vm_area_struct *vma) - { mmap_assert_write_locked(vma->vm_mm); } -static inline void vma_assert_attached(struct vm_area_struct *vma) {} -static inline void vma_assert_detached(struct vm_area_struct *vma) {} -static inline void vma_mark_attached(struct vm_area_struct *vma) {} -static inline void vma_mark_detached(struct vm_area_struct *vma) {} - -static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, - unsigned long address) -{ - return NULL; -} - -static inline void vma_assert_locked(struct vm_area_struct *vma) -{ - mmap_assert_locked(vma->vm_mm); -} - +#else static inline void release_fault_lock(struct vm_fault *vmf) { mmap_read_unlock(vmf->vma->vm_mm); @@ -923,7 +701,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf) { mmap_assert_locked(vmf->vma->vm_mm); } - #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 4706c6769902..7983b2efe9bf 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -1,6 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMAP_LOCK_H #define _LINUX_MMAP_LOCK_H +/* Avoid a dependency loop by declaring here. */ +extern int rcuwait_wake_up(struct rcuwait *w); + #include #include #include @@ -104,6 +108,206 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int return read_seqcount_retry(&mm->mm_lock_seq, seq); } +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + static struct lock_class_key lockdep_key; + + lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); +#endif + if (reset_refcnt) + refcount_set(&vma->vm_refcnt, 0); + vma->vm_lock_seq = UINT_MAX; +} + +static inline bool is_vma_writer_only(int refcnt) +{ + /* + * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma + * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on + * a detached vma happens only in vma_mark_detached() and is a rare + * case, therefore most of the time there will be no unnecessary wakeup. + */ + return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; +} + +static inline void vma_refcount_put(struct vm_area_struct *vma) +{ + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + struct mm_struct *mm = vma->vm_mm; + int oldcnt; + + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { + + if (is_vma_writer_only(oldcnt - 1)) + rcuwait_wake_up(&mm->vma_writer_wait); + } +} + +/* + * Try to read-lock a vma. The function is allowed to occasionally yield false + * locked result to avoid performance overhead, in which case we fall back to + * using mmap_lock. The function should never yield false unlocked result. + * False locked result is possible if mm_lock_seq overflows or if vma gets + * reused and attached to a different mm before we lock it. + * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got + * detached. + */ +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + int oldcnt; + + /* + * Check before locking. A race might cause false locked result. + * We can use READ_ONCE() for the mm_lock_seq here, and don't need + * ACQUIRE semantics, because this is just a lockless check whose result + * we don't rely on for anything - the mm_lock_seq read against which we + * need ordering is below. + */ + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) + return NULL; + + /* + * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() + * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + * Acquire fence is required here to avoid reordering against later + * vm_lock_seq check and checks inside lock_vma_under_rcu(). + */ + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) { + /* return EAGAIN if vma got detached from under us */ + return oldcnt ? NULL : ERR_PTR(-EAGAIN); + } + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + /* + * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. + * False unlocked result is impossible because we modify and check + * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq + * modification invalidates all existing locks. + * + * We must use ACQUIRE semantics for the mm_lock_seq so that if we are + * racing with vma_end_write_all(), we only start reading from the VMA + * after it has been unlocked. + * This pairs with RELEASE semantics in vma_end_write_all(). + */ + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { + vma_refcount_put(vma); + return NULL; + } + + return vma; +} + +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. + * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) +{ + int oldcnt; + + mmap_assert_locked(vma->vm_mm); + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) + return false; + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + return true; +} + +/* + * Use only while holding mmap read lock which guarantees that locking will not + * fail (nobody can concurrently write-lock the vma). vma_start_read() should + * not be used in such cases because it might fail due to mm_lock_seq overflow. + * This functionality is used to obtain vma read lock and drop the mmap read lock. + */ +static inline bool vma_start_read_locked(struct vm_area_struct *vma) +{ + return vma_start_read_locked_nested(vma, 0); +} + +static inline void vma_end_read(struct vm_area_struct *vma) +{ + vma_refcount_put(vma); +} + +/* WARNING! Can only be used if mmap_lock is expected to be write-locked */ +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) +{ + mmap_assert_write_locked(vma->vm_mm); + + /* + * current task is holding mmap_write_lock, both vma->vm_lock_seq and + * mm->mm_lock_seq can't be concurrently modified. + */ + *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; + return (vma->vm_lock_seq == *mm_lock_seq); +} + +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); + +/* + * Begin writing to a VMA. + * Exclude concurrent readers under the per-VMA lock until the currently + * write-locked mmap_lock is dropped or downgraded. + */ +static inline void vma_start_write(struct vm_area_struct *vma) +{ + unsigned int mm_lock_seq; + + if (__is_vma_write_locked(vma, &mm_lock_seq)) + return; + + __vma_start_write(vma, mm_lock_seq); +} + +static inline void vma_assert_write_locked(struct vm_area_struct *vma) +{ + unsigned int mm_lock_seq; + + VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); +} + +static inline void vma_assert_locked(struct vm_area_struct *vma) +{ + unsigned int mm_lock_seq; + + VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && + !__is_vma_write_locked(vma, &mm_lock_seq), vma); +} + +/* + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these + * assertions should be made either under mmap_write_lock or when the object + * has been isolated under mmap_write_lock, ensuring no competing writers. + */ +static inline void vma_assert_attached(struct vm_area_struct *vma) +{ + WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); +} + +static inline void vma_assert_detached(struct vm_area_struct *vma) +{ + WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); +} + +static inline void vma_mark_attached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_detached(vma); + refcount_set_release(&vma->vm_refcnt, 1); +} + +void vma_mark_detached(struct vm_area_struct *vma); + +struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + unsigned long address); + #else /* CONFIG_PER_VMA_LOCK */ static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} @@ -119,6 +323,29 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int { return true; } +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) + { return NULL; } +static inline void vma_end_read(struct vm_area_struct *vma) {} +static inline void vma_start_write(struct vm_area_struct *vma) {} +static inline void vma_assert_write_locked(struct vm_area_struct *vma) + { mmap_assert_write_locked(vma->vm_mm); } +static inline void vma_assert_attached(struct vm_area_struct *vma) {} +static inline void vma_assert_detached(struct vm_area_struct *vma) {} +static inline void vma_mark_attached(struct vm_area_struct *vma) {} +static inline void vma_mark_detached(struct vm_area_struct *vma) {} + +static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + unsigned long address) +{ + return NULL; +} + +static inline void vma_assert_locked(struct vm_area_struct *vma) +{ + mmap_assert_locked(vma->vm_mm); +} #endif /* CONFIG_PER_VMA_LOCK */ -- cgit v1.2.3 From db80bd2cea1b7c2574578461c9de4c3d9ee7634a Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 3 Apr 2025 14:33:03 +0200 Subject: task_stack.h: remove obsolete __HAVE_ARCH_KSTACK_END check Remove __HAVE_ARCH_KSTACK_END as it has been obsolete since removal of metag architecture in v4.17. Link: https://lkml.kernel.org/r/20250403-kstack-end-v1-1-7798e71f34d1@linaro.org Signed-off-by: Pasha Tatashin Link: https://lore.kernel.org/20240311164638.2015063-2-pasha.tatashin@soleen.com Signed-off-by: Linus Walleij Signed-off-by: Andrew Morton --- include/linux/sched/task_stack.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index cffad65bdc6a..85c5a6392e02 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -106,7 +106,6 @@ static inline unsigned long stack_not_used(struct task_struct *p) #endif extern void set_task_stack_end_magic(struct task_struct *tsk); -#ifndef __HAVE_ARCH_KSTACK_END static inline int kstack_end(void *addr) { /* Reliable end of stack detection: @@ -114,6 +113,5 @@ static inline int kstack_end(void *addr) */ return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); } -#endif #endif /* _LINUX_SCHED_TASK_STACK_H */ -- cgit v1.2.3 From 4ef5211ee68113070bd42142f06347866675055e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 24 Mar 2025 12:50:24 +0200 Subject: kernel.h: move READ/WRITE definitions to Patch series "kernel.h: Move out a couple of macros and constants". kernel.h hosts a couple of macros and constants that may be better placed. Do that. Also add missing documentation. No functional changes intended. This patch (of 2): Headers shouldn't be forced to include just to gain these simple constants. Link: https://lkml.kernel.org/r/20250324105228.775784-1-andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20250324105228.775784-2-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Andy Shevchenko Cc: Alexandru Ardelean Signed-off-by: Andrew Morton --- include/linux/kernel.h | 4 ---- include/linux/types.h | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index be2e8c0a187e..01bb0fac3667 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -41,10 +41,6 @@ #define STACK_MAGIC 0xdeadbeef -/* generic data direction definitions */ -#define READ 0 -#define WRITE 1 - #define PTR_IF(cond, ptr) ((cond) ? (ptr) : NULL) #define u64_to_user_ptr(x) ( \ diff --git a/include/linux/types.h b/include/linux/types.h index 49b79c8bb1a9..6dfdb8e8e4c3 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -136,6 +136,10 @@ typedef s64 ktime_t; typedef u64 sector_t; typedef u64 blkcnt_t; +/* generic data direction definitions */ +#define READ 0 +#define WRITE 1 + /* * The type of an index into the pagecache. */ -- cgit v1.2.3 From 029c896c4105b7d74fcd88d99c5fbab2558fee89 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 24 Mar 2025 12:50:25 +0200 Subject: kernel.h: move PTR_IF() and u64_to_user_ptr() to util_macros.h While the natural choice of PTR_IF() is kconfig.h, the latter is too broad to include C code and actually the macro was moved out from there in the past. But kernel.h is neither a good choice for that. Move it to util_macros.h. Do the same for u64_to_user_ptr(). While moving, add necessary documentation. Link: https://lkml.kernel.org/r/20250324105228.775784-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Alexandru Ardelean Cc: Ingo Molnar Signed-off-by: Andrew Morton --- include/linux/kernel.h | 10 +------ include/linux/util_macros.h | 66 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 01bb0fac3667..1cce1f6410a9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -41,15 +42,6 @@ #define STACK_MAGIC 0xdeadbeef -#define PTR_IF(cond, ptr) ((cond) ? (ptr) : NULL) - -#define u64_to_user_ptr(x) ( \ -{ \ - typecheck(u64, (x)); \ - (void __user *)(uintptr_t)(x); \ -} \ -) - struct completion; struct user; diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h index 3b570b765b75..7b64fb597f85 100644 --- a/include/linux/util_macros.h +++ b/include/linux/util_macros.h @@ -79,6 +79,72 @@ (__fc_i); \ }) +/** + * PTR_IF - evaluate to @ptr if @cond is true, or to NULL otherwise. + * @cond: A conditional, usually in a form of IS_ENABLED(CONFIG_FOO) + * @ptr: A pointer to assign if @cond is true. + * + * PTR_IF(IS_ENABLED(CONFIG_FOO), ptr) evaluates to @ptr if CONFIG_FOO is set + * to 'y' or 'm', or to NULL otherwise. The (ptr) argument must be a pointer. + * + * The macro can be very useful to help compiler dropping dead code. + * + * For instance, consider the following:: + * + * #ifdef CONFIG_FOO_SUSPEND + * static int foo_suspend(struct device *dev) + * { + * ... + * } + * #endif + * + * static struct pm_ops foo_ops = { + * #ifdef CONFIG_FOO_SUSPEND + * .suspend = foo_suspend, + * #endif + * }; + * + * While this works, the foo_suspend() macro is compiled conditionally, + * only when CONFIG_FOO_SUSPEND is set. This is problematic, as there could + * be a build bug in this function, we wouldn't have a way to know unless + * the configuration option is set. + * + * An alternative is to declare foo_suspend() always, but mark it + * as __maybe_unused. This works, but the __maybe_unused attribute + * is required to instruct the compiler that the function may not + * be referenced anywhere, and is safe to remove without making + * a fuss about it. This makes the programmer responsible for tagging + * the functions that can be garbage-collected. + * + * With the macro it is possible to write the following: + * + * static int foo_suspend(struct device *dev) + * { + * ... + * } + * + * static struct pm_ops foo_ops = { + * .suspend = PTR_IF(IS_ENABLED(CONFIG_FOO_SUSPEND), foo_suspend), + * }; + * + * The foo_suspend() function will now be automatically dropped by the + * compiler, and it does not require any specific attribute. + */ +#define PTR_IF(cond, ptr) ((cond) ? (ptr) : NULL) + +/** + * to_user_ptr - cast a pointer passed as u64 from user space to void __user * + * @x: The u64 value from user space, usually via IOCTL + * + * to_user_ptr() simply casts a pointer passed as u64 from user space to void + * __user * correctly. Using this lets us get rid of all the tiresome casts. + */ +#define u64_to_user_ptr(x) \ +({ \ + typecheck(u64, (x)); \ + (void __user *)(uintptr_t)(x); \ +}) + /** * is_insidevar - check if the @ptr points inside the @var memory range. * @ptr: the pointer to a memory address. -- cgit v1.2.3 From e711faaafbe54a884f33b53472434063d342f6d4 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Mon, 14 Apr 2025 22:59:43 +0800 Subject: hung_task: replace blocker_mutex with encoded blocker Patch series "hung_task: extend blocking task stacktrace dump to semaphore", v5. Inspired by mutex blocker tracking[1], this patch series extend the feature to not only dump the blocker task holding a mutex but also to support semaphores. Unlike mutexes, semaphores lack explicit ownership tracking, making it challenging to identify the root cause of hangs. To address this, we introduce a last_holder field to the semaphore structure, which is updated when a task successfully calls down() and cleared during up(). The assumption is that if a task is blocked on a semaphore, the holders must not have released it. While this does not guarantee that the last holder is one of the current blockers, it likely provides a practical hint for diagnosing semaphore-related stalls. With this change, the hung task detector can now show blocker task's info like below: [Tue Apr 8 12:19:07 2025] INFO: task cat:945 blocked for more than 120 seconds. [Tue Apr 8 12:19:07 2025] Tainted: G E 6.14.0-rc6+ #1 [Tue Apr 8 12:19:07 2025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [Tue Apr 8 12:19:07 2025] task:cat state:D stack:0 pid:945 tgid:945 ppid:828 task_flags:0x400000 flags:0x00000000 [Tue Apr 8 12:19:07 2025] Call Trace: [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] __schedule+0x491/0xbd0 [Tue Apr 8 12:19:07 2025] schedule+0x27/0xf0 [Tue Apr 8 12:19:07 2025] schedule_timeout+0xe3/0xf0 [Tue Apr 8 12:19:07 2025] ? __folio_mod_stat+0x2a/0x80 [Tue Apr 8 12:19:07 2025] ? set_ptes.constprop.0+0x27/0x90 [Tue Apr 8 12:19:07 2025] __down_common+0x155/0x280 [Tue Apr 8 12:19:07 2025] down+0x53/0x70 [Tue Apr 8 12:19:07 2025] read_dummy_semaphore+0x23/0x60 [Tue Apr 8 12:19:07 2025] full_proxy_read+0x5f/0xa0 [Tue Apr 8 12:19:07 2025] vfs_read+0xbc/0x350 [Tue Apr 8 12:19:07 2025] ? __count_memcg_events+0xa5/0x140 [Tue Apr 8 12:19:07 2025] ? count_memcg_events.constprop.0+0x1a/0x30 [Tue Apr 8 12:19:07 2025] ? handle_mm_fault+0x180/0x260 [Tue Apr 8 12:19:07 2025] ksys_read+0x66/0xe0 [Tue Apr 8 12:19:07 2025] do_syscall_64+0x51/0x120 [Tue Apr 8 12:19:07 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Tue Apr 8 12:19:07 2025] RIP: 0033:0x7f419478f46e [Tue Apr 8 12:19:07 2025] RSP: 002b:00007fff1c4d2668 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Tue Apr 8 12:19:07 2025] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f419478f46e [Tue Apr 8 12:19:07 2025] RDX: 0000000000020000 RSI: 00007f4194683000 RDI: 0000000000000003 [Tue Apr 8 12:19:07 2025] RBP: 00007f4194683000 R08: 00007f4194682010 R09: 0000000000000000 [Tue Apr 8 12:19:07 2025] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000 [Tue Apr 8 12:19:07 2025] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] INFO: task cat:945 blocked on a semaphore likely last held by task cat:938 [Tue Apr 8 12:19:07 2025] task:cat state:S stack:0 pid:938 tgid:938 ppid:584 task_flags:0x400000 flags:0x00000000 [Tue Apr 8 12:19:07 2025] Call Trace: [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] __schedule+0x491/0xbd0 [Tue Apr 8 12:19:07 2025] ? _raw_spin_unlock_irqrestore+0xe/0x40 [Tue Apr 8 12:19:07 2025] schedule+0x27/0xf0 [Tue Apr 8 12:19:07 2025] schedule_timeout+0x77/0xf0 [Tue Apr 8 12:19:07 2025] ? __pfx_process_timeout+0x10/0x10 [Tue Apr 8 12:19:07 2025] msleep_interruptible+0x49/0x60 [Tue Apr 8 12:19:07 2025] read_dummy_semaphore+0x2d/0x60 [Tue Apr 8 12:19:07 2025] full_proxy_read+0x5f/0xa0 [Tue Apr 8 12:19:07 2025] vfs_read+0xbc/0x350 [Tue Apr 8 12:19:07 2025] ? __count_memcg_events+0xa5/0x140 [Tue Apr 8 12:19:07 2025] ? count_memcg_events.constprop.0+0x1a/0x30 [Tue Apr 8 12:19:07 2025] ? handle_mm_fault+0x180/0x260 [Tue Apr 8 12:19:07 2025] ksys_read+0x66/0xe0 [Tue Apr 8 12:19:07 2025] do_syscall_64+0x51/0x120 [Tue Apr 8 12:19:07 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Tue Apr 8 12:19:07 2025] RIP: 0033:0x7f7c584a646e [Tue Apr 8 12:19:07 2025] RSP: 002b:00007ffdba8ce158 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Tue Apr 8 12:19:07 2025] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f7c584a646e [Tue Apr 8 12:19:07 2025] RDX: 0000000000020000 RSI: 00007f7c5839a000 RDI: 0000000000000003 [Tue Apr 8 12:19:07 2025] RBP: 00007f7c5839a000 R08: 00007f7c58399010 R09: 0000000000000000 [Tue Apr 8 12:19:07 2025] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000 [Tue Apr 8 12:19:07 2025] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 [Tue Apr 8 12:19:07 2025] This patch (of 3): This patch replaces 'struct mutex *blocker_mutex' with 'unsigned long blocker', as only one blocker is active at a time. The blocker filed can store both the lock addrees and the lock type, with LSB used to encode the type as Masami suggested, making it easier to extend the feature to cover other types of locks. Also, once the lock type is determined, we can directly extract the address and cast it to a lock pointer ;) Link: https://lkml.kernel.org/r/20250414145945.84916-1-ioworker0@gmail.com Link: https://lore.kernel.org/all/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com [1] Link: https://lkml.kernel.org/r/20250414145945.84916-2-ioworker0@gmail.com Signed-off-by: Mingzhe Yang Signed-off-by: Lance Yang Reviewed-by: Masami Hiramatsu (Google) Suggested-by: Andrew Morton Suggested-by: Masami Hiramatsu (Google) Cc: Anna Schumaker Cc: Boqun Feng Cc: Ingo Molnar Cc: Joel Granados Cc: John Stultz Cc: Kent Overstreet Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Tomasz Figa Cc: Waiman Long Cc: Will Deacon Cc: Yongliang Gao Cc: Zi Li Signed-off-by: Andrew Morton --- include/linux/hung_task.h | 99 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 6 ++- 2 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 include/linux/hung_task.h (limited to 'include/linux') diff --git a/include/linux/hung_task.h b/include/linux/hung_task.h new file mode 100644 index 000000000000..1bc2b3244613 --- /dev/null +++ b/include/linux/hung_task.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Detect Hung Task: detecting tasks stuck in D state + * + * Copyright (C) 2025 Tongcheng Travel (www.ly.com) + * Author: Lance Yang + */ +#ifndef __LINUX_HUNG_TASK_H +#define __LINUX_HUNG_TASK_H + +#include +#include +#include + +/* + * @blocker: Combines lock address and blocking type. + * + * Since lock pointers are at least 4-byte aligned(32-bit) or 8-byte + * aligned(64-bit). This leaves the 2 least bits (LSBs) of the pointer + * always zero. So we can use these bits to encode the specific blocking + * type. + * + * Type encoding: + * 00 - Blocked on mutex (BLOCKER_TYPE_MUTEX) + * 01 - Blocked on semaphore (BLOCKER_TYPE_SEM) + * 10 - Blocked on rt-mutex (BLOCKER_TYPE_RTMUTEX) + * 11 - Blocked on rw-semaphore (BLOCKER_TYPE_RWSEM) + */ +#define BLOCKER_TYPE_MUTEX 0x00UL +#define BLOCKER_TYPE_SEM 0x01UL +#define BLOCKER_TYPE_RTMUTEX 0x02UL +#define BLOCKER_TYPE_RWSEM 0x03UL + +#define BLOCKER_TYPE_MASK 0x03UL + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +static inline void hung_task_set_blocker(void *lock, unsigned long type) +{ + unsigned long lock_ptr = (unsigned long)lock; + + WARN_ON_ONCE(!lock_ptr); + WARN_ON_ONCE(READ_ONCE(current->blocker)); + + /* + * If the lock pointer matches the BLOCKER_TYPE_MASK, return + * without writing anything. + */ + if (WARN_ON_ONCE(lock_ptr & BLOCKER_TYPE_MASK)) + return; + + WRITE_ONCE(current->blocker, lock_ptr | type); +} + +static inline void hung_task_clear_blocker(void) +{ + WARN_ON_ONCE(!READ_ONCE(current->blocker)); + + WRITE_ONCE(current->blocker, 0UL); +} + +/* + * hung_task_get_blocker_type - Extracts blocker type from encoded blocker + * address. + * + * @blocker: Blocker pointer with encoded type (via LSB bits) + * + * Returns: BLOCKER_TYPE_MUTEX, BLOCKER_TYPE_SEM, etc. + */ +static inline unsigned long hung_task_get_blocker_type(unsigned long blocker) +{ + WARN_ON_ONCE(!blocker); + + return blocker & BLOCKER_TYPE_MASK; +} + +static inline void *hung_task_blocker_to_lock(unsigned long blocker) +{ + WARN_ON_ONCE(!blocker); + + return (void *)(blocker & ~BLOCKER_TYPE_MASK); +} +#else +static inline void hung_task_set_blocker(void *lock, unsigned long type) +{ +} +static inline void hung_task_clear_blocker(void) +{ +} +static inline unsigned long hung_task_get_blocker_type(unsigned long blocker) +{ + return 0UL; +} +static inline void *hung_task_blocker_to_lock(unsigned long blocker) +{ + return NULL; +} +#endif + +#endif /* __LINUX_HUNG_TASK_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f96ac1982893..393d81978f40 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1240,7 +1240,11 @@ struct task_struct { #endif #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER - struct mutex *blocker_mutex; + /* + * Encoded lock address causing task block (lower 2 bits = type from + * ). Accessed via hung_task_*() helpers. + */ + unsigned long blocker; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -- cgit v1.2.3 From 194a9b9e843b4077033be70a07d191107972aa53 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Mon, 14 Apr 2025 22:59:44 +0800 Subject: hung_task: show the blocker task if the task is hung on semaphore Inspired by mutex blocker tracking[1], this patch makes a trade-off to balance the overhead and utility of the hung task detector. Unlike mutexes, semaphores lack explicit ownership tracking, making it challenging to identify the root cause of hangs. To address this, we introduce a last_holder field to the semaphore structure, which is updated when a task successfully calls down() and cleared during up(). The assumption is that if a task is blocked on a semaphore, the holders must not have released it. While this does not guarantee that the last holder is one of the current blockers, it likely provides a practical hint for diagnosing semaphore-related stalls. With this change, the hung task detector can now show blocker task's info like below: [Tue Apr 8 12:19:07 2025] INFO: task cat:945 blocked for more than 120 seconds. [Tue Apr 8 12:19:07 2025] Tainted: G E 6.14.0-rc6+ #1 [Tue Apr 8 12:19:07 2025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [Tue Apr 8 12:19:07 2025] task:cat state:D stack:0 pid:945 tgid:945 ppid:828 task_flags:0x400000 flags:0x00000000 [Tue Apr 8 12:19:07 2025] Call Trace: [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] __schedule+0x491/0xbd0 [Tue Apr 8 12:19:07 2025] schedule+0x27/0xf0 [Tue Apr 8 12:19:07 2025] schedule_timeout+0xe3/0xf0 [Tue Apr 8 12:19:07 2025] ? __folio_mod_stat+0x2a/0x80 [Tue Apr 8 12:19:07 2025] ? set_ptes.constprop.0+0x27/0x90 [Tue Apr 8 12:19:07 2025] __down_common+0x155/0x280 [Tue Apr 8 12:19:07 2025] down+0x53/0x70 [Tue Apr 8 12:19:07 2025] read_dummy_semaphore+0x23/0x60 [Tue Apr 8 12:19:07 2025] full_proxy_read+0x5f/0xa0 [Tue Apr 8 12:19:07 2025] vfs_read+0xbc/0x350 [Tue Apr 8 12:19:07 2025] ? __count_memcg_events+0xa5/0x140 [Tue Apr 8 12:19:07 2025] ? count_memcg_events.constprop.0+0x1a/0x30 [Tue Apr 8 12:19:07 2025] ? handle_mm_fault+0x180/0x260 [Tue Apr 8 12:19:07 2025] ksys_read+0x66/0xe0 [Tue Apr 8 12:19:07 2025] do_syscall_64+0x51/0x120 [Tue Apr 8 12:19:07 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Tue Apr 8 12:19:07 2025] RIP: 0033:0x7f419478f46e [Tue Apr 8 12:19:07 2025] RSP: 002b:00007fff1c4d2668 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Tue Apr 8 12:19:07 2025] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f419478f46e [Tue Apr 8 12:19:07 2025] RDX: 0000000000020000 RSI: 00007f4194683000 RDI: 0000000000000003 [Tue Apr 8 12:19:07 2025] RBP: 00007f4194683000 R08: 00007f4194682010 R09: 0000000000000000 [Tue Apr 8 12:19:07 2025] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000 [Tue Apr 8 12:19:07 2025] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] INFO: task cat:945 blocked on a semaphore likely last held by task cat:938 [Tue Apr 8 12:19:07 2025] task:cat state:S stack:0 pid:938 tgid:938 ppid:584 task_flags:0x400000 flags:0x00000000 [Tue Apr 8 12:19:07 2025] Call Trace: [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] __schedule+0x491/0xbd0 [Tue Apr 8 12:19:07 2025] ? _raw_spin_unlock_irqrestore+0xe/0x40 [Tue Apr 8 12:19:07 2025] schedule+0x27/0xf0 [Tue Apr 8 12:19:07 2025] schedule_timeout+0x77/0xf0 [Tue Apr 8 12:19:07 2025] ? __pfx_process_timeout+0x10/0x10 [Tue Apr 8 12:19:07 2025] msleep_interruptible+0x49/0x60 [Tue Apr 8 12:19:07 2025] read_dummy_semaphore+0x2d/0x60 [Tue Apr 8 12:19:07 2025] full_proxy_read+0x5f/0xa0 [Tue Apr 8 12:19:07 2025] vfs_read+0xbc/0x350 [Tue Apr 8 12:19:07 2025] ? __count_memcg_events+0xa5/0x140 [Tue Apr 8 12:19:07 2025] ? count_memcg_events.constprop.0+0x1a/0x30 [Tue Apr 8 12:19:07 2025] ? handle_mm_fault+0x180/0x260 [Tue Apr 8 12:19:07 2025] ksys_read+0x66/0xe0 [Tue Apr 8 12:19:07 2025] do_syscall_64+0x51/0x120 [Tue Apr 8 12:19:07 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Tue Apr 8 12:19:07 2025] RIP: 0033:0x7f7c584a646e [Tue Apr 8 12:19:07 2025] RSP: 002b:00007ffdba8ce158 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Tue Apr 8 12:19:07 2025] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f7c584a646e [Tue Apr 8 12:19:07 2025] RDX: 0000000000020000 RSI: 00007f7c5839a000 RDI: 0000000000000003 [Tue Apr 8 12:19:07 2025] RBP: 00007f7c5839a000 R08: 00007f7c58399010 R09: 0000000000000000 [Tue Apr 8 12:19:07 2025] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000 [Tue Apr 8 12:19:07 2025] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 [Tue Apr 8 12:19:07 2025] [1] https://lore.kernel.org/all/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com Link: https://lkml.kernel.org/r/20250414145945.84916-3-ioworker0@gmail.com Signed-off-by: Mingzhe Yang Signed-off-by: Lance Yang Suggested-by: Andrew Morton Suggested-by: Masami Hiramatsu (Google) Reviewed-by: Masami Hiramatsu (Google) Cc: Anna Schumaker Cc: Boqun Feng Cc: Ingo Molnar Cc: Joel Granados Cc: John Stultz Cc: Kent Overstreet Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Tomasz Figa Cc: Waiman Long Cc: Will Deacon Cc: Yongliang Gao Cc: Zi Li Signed-off-by: Andrew Morton --- include/linux/semaphore.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index 04655faadc2d..89706157e622 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -16,13 +16,25 @@ struct semaphore { raw_spinlock_t lock; unsigned int count; struct list_head wait_list; + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + unsigned long last_holder; +#endif }; +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +#define __LAST_HOLDER_SEMAPHORE_INITIALIZER \ + , .last_holder = 0UL +#else +#define __LAST_HOLDER_SEMAPHORE_INITIALIZER +#endif + #define __SEMAPHORE_INITIALIZER(name, n) \ { \ .lock = __RAW_SPIN_LOCK_UNLOCKED((name).lock), \ .count = n, \ - .wait_list = LIST_HEAD_INIT((name).wait_list), \ + .wait_list = LIST_HEAD_INIT((name).wait_list) \ + __LAST_HOLDER_SEMAPHORE_INITIALIZER \ } /* @@ -47,5 +59,6 @@ extern int __must_check down_killable(struct semaphore *sem); extern int __must_check down_trylock(struct semaphore *sem); extern int __must_check down_timeout(struct semaphore *sem, long jiffies); extern void up(struct semaphore *sem); +extern unsigned long sem_last_holder(struct semaphore *sem); #endif /* __LINUX_SEMAPHORE_H */ -- cgit v1.2.3 From 8d1d4b538bb12a858fa95a073c4aff31e79088ee Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 16 Apr 2025 10:06:13 -0600 Subject: scatterlist: inline sg_next() sg_next() is a short function called frequently in I/O paths. Define it in the header file so it can be inlined into its callers. Link: https://lkml.kernel.org/r/20250416160615.3571958-1-csander@purestorage.com Signed-off-by: Caleb Sander Mateos Cc: Eric Biggers Signed-off-by: Andrew Morton --- include/linux/scatterlist.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 138e2f1bd08f..0cdbfc42f153 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -94,6 +94,28 @@ static inline bool sg_is_last(struct scatterlist *sg) return __sg_flags(sg) & SG_END; } +/** + * sg_next - return the next scatterlist entry in a list + * @sg: The current sg entry + * + * Description: + * Usually the next entry will be @sg@ + 1, but if this sg element is part + * of a chained scatterlist, it could jump to the start of a new + * scatterlist array. + * + **/ +static inline struct scatterlist *sg_next(struct scatterlist *sg) +{ + if (sg_is_last(sg)) + return NULL; + + sg++; + if (unlikely(sg_is_chain(sg))) + sg = sg_chain_ptr(sg); + + return sg; +} + /** * sg_assign_page - Assign a given page to an SG entry * @sg: SG entry @@ -418,7 +440,6 @@ static inline void sg_init_marker(struct scatterlist *sgl, int sg_nents(struct scatterlist *sg); int sg_nents_for_len(struct scatterlist *sg, u64 len); -struct scatterlist *sg_next(struct scatterlist *); struct scatterlist *sg_last(struct scatterlist *s, unsigned int); void sg_init_table(struct scatterlist *, unsigned int); void sg_init_one(struct scatterlist *, const void *, unsigned int); -- cgit v1.2.3 From ba8182d44b4e557e2dc34e51a88105ce5b083372 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 19 Apr 2025 21:30:12 +0100 Subject: rapidio: remove unused functions rio_request_dma() and rio_dma_prep_slave_sg() were added in 2012 by commit e42d98ebe7d7 ("rapidio: add DMA engine support for RIO data transfers") but never used. rio_find_mport() last use was removed in 2013 by commit 9edbc30b434f ("rapidio: update enumerator registration mechanism") rio_unregister_scan() was added in 2013 by commit a11650e11093 ("rapidio: make enumeration/discovery configurable") but never used. Remove them. Link: https://lkml.kernel.org/r/20250419203012.429787-3-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Cc: Alexandre Bounine Cc: Matt Porter Signed-off-by: Andrew Morton --- include/linux/rio_drv.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h index e49c32b0f394..dd8afe511242 100644 --- a/include/linux/rio_drv.h +++ b/include/linux/rio_drv.h @@ -391,13 +391,8 @@ struct rio_dev *rio_dev_get(struct rio_dev *); void rio_dev_put(struct rio_dev *); #ifdef CONFIG_RAPIDIO_DMA_ENGINE -extern struct dma_chan *rio_request_dma(struct rio_dev *rdev); extern struct dma_chan *rio_request_mport_dma(struct rio_mport *mport); extern void rio_release_dma(struct dma_chan *dchan); -extern struct dma_async_tx_descriptor *rio_dma_prep_slave_sg( - struct rio_dev *rdev, struct dma_chan *dchan, - struct rio_dma_data *data, - enum dma_transfer_direction direction, unsigned long flags); extern struct dma_async_tx_descriptor *rio_dma_prep_xfer( struct dma_chan *dchan, u16 destid, struct rio_dma_data *data, -- cgit v1.2.3 From 2a1c6158131f05c228001e1f73304535bc205444 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 19 Apr 2025 00:49:32 +0100 Subject: relay: remove unused relay_late_setup_files The last use of relay_late_setup_files() was removed in 2018 by commit 2b47733045aa ("drm/i915/guc: Merge log relay file and channel creation") Remove it and the helper it used. relay_late_setup_files() was used for eventually registering 'buffer only' channels. With it gone, delete the docs that explain how to do that. Which suggests it should be possible to lose the 'has_base_filename' flags. (Are there any other uses??) Link: https://lkml.kernel.org/r/20250418234932.490863-1-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Jens Axboe Cc: Al Viro Cc: Andriy Shevchenko Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/relay.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/relay.h b/include/linux/relay.h index 72b876dd5cb8..b3224111d074 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -159,9 +159,6 @@ struct rchan *relay_open(const char *base_filename, size_t n_subbufs, const struct rchan_callbacks *cb, void *private_data); -extern int relay_late_setup_files(struct rchan *chan, - const char *base_filename, - struct dentry *parent); extern void relay_close(struct rchan *chan); extern void relay_flush(struct rchan *chan); extern void relay_subbufs_consumed(struct rchan *chan, -- cgit v1.2.3 From 2e27fa943b74c9fc5fdf93090508c3c722531c66 Mon Sep 17 00:00:00 2001 From: WangYuli Date: Mon, 21 Apr 2025 15:49:10 +0800 Subject: treewide: fix typo "previlege" There are some spelling mistakes of 'previlege' in comments which should be 'privilege'. Fix them and add it to scripts/spelling.txt. The typo in arch/loongarch/kvm/main.c was corrected by a different patch [1] and is therefore not included in this submission. [1]. https://lore.kernel.org/all/20250420142208.2252280-1-wheatfox17@icloud.com/ Link: https://lkml.kernel.org/r/46AD404E411A4BAC+20250421074910.66988-1-wangyuli@uniontech.com Signed-off-by: WangYuli Signed-off-by: Andrew Morton --- include/linux/habanalabs/hl_boot_if.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/habanalabs/hl_boot_if.h b/include/linux/habanalabs/hl_boot_if.h index d2a9fc96424b..af5fb4ad77eb 100644 --- a/include/linux/habanalabs/hl_boot_if.h +++ b/include/linux/habanalabs/hl_boot_if.h @@ -295,7 +295,7 @@ enum cpu_boot_dev_sts { * Initialized in: linux * * CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN GIC access permission only from - * previleged entity. FW sets this status + * privileged entity. FW sets this status * bit for host. If this bit is set then * GIC can not be accessed from host. * Initialized in: linux -- cgit v1.2.3 From 9c7b53b21fb1df20f85e3822160b4687f98ff7b3 Mon Sep 17 00:00:00 2001 From: Marc Herbert Date: Thu, 24 Apr 2025 19:40:35 +0000 Subject: compiler_types.h: fix "unused variable" in __compiletime_assert() This refines commit c03567a8e8d5 ("include/linux/compiler.h: don't perform compiletime_assert with -O0") and restores #ifdef __OPTIMIZE__ symmetry by evaluating the 'condition' variable in both compile-time variants of __compiletimeassert(). As __OPTIMIZE__ is always true by default, this commit does not change anything by default. But it fixes warnings with _non-default_ CFLAGS like for instance this: make CFLAGS_tcp.o='-Og -U__OPTIMIZE__' from net/ipv4/tcp.c:273: include/net/sch_generic.h: In function `qdisc_cb_private_validate': include/net/sch_generic.h:511:30: error: unused variable `qcb' [-Werror=unused-variable] { struct qdisc_skb_cb *qcb; BUILD_BUG_ON(sizeof(skb->cb) < sizeof(*qcb)); ... } [akpm@linux-foundation.org: regularize comment layout, reflow comment] Link: https://lkml.kernel.org/r/20250424194048.652571-1-marc.herbert@linux.intel.com Signed-off-by: Marc Herbert Cc: Alexander Potapenko Cc: Changbin Du Cc: Jan Hendrik Farr Cc: Macro Elver Cc: Miguel Ojeda Cc: Tony Ambardar Cc: Uros Bizjak Cc: Kees Cook Signed-off-by: Andrew Morton --- include/linux/compiler_types.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 501cffddc2f4..1f2eee8331d3 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -525,6 +525,12 @@ struct ftrace_likely_data { sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) #ifdef __OPTIMIZE__ +/* + * #ifdef __OPTIMIZE__ is only a good approximation; for instance "make + * CFLAGS_foo.o=-Og" defines __OPTIMIZE__, does not elide the conditional code + * and can break compilation with wrong error message(s). Combine with + * -U__OPTIMIZE__ when needed. + */ # define __compiletime_assert(condition, msg, prefix, suffix) \ do { \ /* \ @@ -538,7 +544,7 @@ struct ftrace_likely_data { prefix ## suffix(); \ } while (0) #else -# define __compiletime_assert(condition, msg, prefix, suffix) do { } while (0) +# define __compiletime_assert(condition, msg, prefix, suffix) ((void)(condition)) #endif #define _compiletime_assert(condition, msg, prefix, suffix) \ -- cgit v1.2.3 From f3def8270c67217efb0e23dcd54714f74de3b6b2 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sun, 27 Apr 2025 23:14:49 +0300 Subject: sort.h: hoist cmp_int() into generic header file Deduplicate the same functionality implemented in several places by moving the cmp_int() helper macro into linux/sort.h. The macro performs a three-way comparison of the arguments mostly useful in different sorting strategies and algorithms. Link: https://lkml.kernel.org/r/20250427201451.900730-1-pchelkin@ispras.ru Signed-off-by: Fedor Pchelkin Suggested-by: Darrick J. Wong Acked-by: Kent Overstreet Acked-by: Coly Li Cc: Al Viro Cc: Carlos Maiolino Cc: Christian Brauner Cc: Coly Li Cc: Jan Kara Signed-off-by: Andrew Morton --- include/linux/sort.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sort.h b/include/linux/sort.h index 8e5603b10941..c01ef804a0eb 100644 --- a/include/linux/sort.h +++ b/include/linux/sort.h @@ -4,6 +4,16 @@ #include +/** + * cmp_int - perform a three-way comparison of the arguments + * @l: the left argument + * @r: the right argument + * + * Return: 1 if the left argument is greater than the right one; 0 if the + * arguments are equal; -1 if the left argument is less than the right one. + */ +#define cmp_int(l, r) (((l) > (r)) - ((l) < (r))) + void sort_r(void *base, size_t num, size_t size, cmp_r_func_t cmp_func, swap_r_func_t swap_func, -- cgit v1.2.3 From c91d78622e16f628176d7248044106b03818af6d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 28 Apr 2025 10:27:37 +0300 Subject: util_macros.h: fix the reference in kernel-doc In PTR_IF() description the text refers to the parameter as (ptr) while the kernel-doc format asks for @ptr. Fix this accordingly. Link: https://lkml.kernel.org/r/20250428072737.3265239-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Alexandru Ardelean Signed-off-by: Andrew Morton --- include/linux/util_macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h index 7b64fb597f85..8183ac610cc5 100644 --- a/include/linux/util_macros.h +++ b/include/linux/util_macros.h @@ -85,7 +85,7 @@ * @ptr: A pointer to assign if @cond is true. * * PTR_IF(IS_ENABLED(CONFIG_FOO), ptr) evaluates to @ptr if CONFIG_FOO is set - * to 'y' or 'm', or to NULL otherwise. The (ptr) argument must be a pointer. + * to 'y' or 'm', or to NULL otherwise. The @ptr argument must be a pointer. * * The macro can be very useful to help compiler dropping dead code. * -- cgit v1.2.3 From 479d26ee013c5388ad6855e512ab20d81e43750d Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 1 May 2025 02:05:02 +0100 Subject: lib/oid_registry.c: remove unused sprint_OID sprint_OID() was added as part of 2012's commit 4f73175d0375 ("X.509: Add utility functions to render OIDs as strings") but it hasn't been used. Remove it. Note that there's also 'sprint_oid' (lower case) which is used in a lot of places; that's left as is except for fixing its case in a comment. Link: https://lkml.kernel.org/r/20250501010502.326472-1-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Andrew Morton --- include/linux/oid_registry.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h index 6f9242259edc..6de479ebbe5d 100644 --- a/include/linux/oid_registry.h +++ b/include/linux/oid_registry.h @@ -151,6 +151,5 @@ enum OID { extern enum OID look_up_OID(const void *data, size_t datasize); extern int parse_OID(const void *data, size_t datasize, enum OID *oid); extern int sprint_oid(const void *, size_t, char *, size_t); -extern int sprint_OID(enum OID, char *, size_t); #endif /* _LINUX_OID_REGISTRY_H */ -- cgit v1.2.3 From cf80fdbc0a5512feabbc56280b2abbb51d4e5b4e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 May 2025 15:17:42 +0300 Subject: list: remove redundant 'extern' for function prototypes The 'extern' keyword is redundant for function prototypes. list.h never used them and new code in general is better without them. Link: https://lkml.kernel.org/r/20250502121742.3997529-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton --- include/linux/list.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 29a375889fb8..e7e28afd28f8 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -50,9 +50,9 @@ static inline void INIT_LIST_HEAD(struct list_head *list) * Performs the full set of list corruption checks before __list_add(). * On list corruption reports a warning, and returns false. */ -extern bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new, - struct list_head *prev, - struct list_head *next); +bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new, + struct list_head *prev, + struct list_head *next); /* * Performs list corruption checks before __list_add(). Returns false if a @@ -93,7 +93,7 @@ static __always_inline bool __list_add_valid(struct list_head *new, * Performs the full set of list corruption checks before __list_del_entry(). * On list corruption reports a warning, and returns false. */ -extern bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry); +bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry); /* * Performs list corruption checks before __list_del_entry(). Returns false if a -- cgit v1.2.3 From 285e871884ff3dc31c0c2c1a87f0018481bc8471 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 28 Apr 2025 12:22:16 +0300 Subject: mm/hmm: let users to tag specific PFN with DMA mapped bit Introduce new sticky flag (HMM_PFN_DMA_MAPPED), which isn't overwritten by HMM range fault. Such flag allows users to tag specific PFNs with information if this specific PFN was already DMA mapped. Tested-by: Jens Axboe Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- include/linux/hmm.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 126a36571667..a43e56f273a1 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -23,6 +23,8 @@ struct mmu_interval_notifier; * HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID) * HMM_PFN_ERROR - accessing the pfn is impossible and the device should * fail. ie poisoned memory, special pages, no vma, etc + * HMM_PFN_DMA_MAPPED - Flag preserved on input-to-output transformation + * to mark that page is already DMA mapped * * On input: * 0 - Return the current state of the page, do not fault it. @@ -36,13 +38,19 @@ enum hmm_pfn_flags { HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1), HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2), HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3), - HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 8), + /* + * Sticky flags, carried from input to output, + * don't forget to update HMM_PFN_INOUT_FLAGS + */ + HMM_PFN_DMA_MAPPED = 1UL << (BITS_PER_LONG - 4), + + HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 9), /* Input flags */ HMM_PFN_REQ_FAULT = HMM_PFN_VALID, HMM_PFN_REQ_WRITE = HMM_PFN_WRITE, - HMM_PFN_FLAGS = 0xFFUL << HMM_PFN_ORDER_SHIFT, + HMM_PFN_FLAGS = ~((1UL << HMM_PFN_ORDER_SHIFT) - 1), }; /* @@ -57,6 +65,14 @@ static inline struct page *hmm_pfn_to_page(unsigned long hmm_pfn) return pfn_to_page(hmm_pfn & ~HMM_PFN_FLAGS); } +/* + * hmm_pfn_to_phys() - return physical address pointed to by a device entry + */ +static inline phys_addr_t hmm_pfn_to_phys(unsigned long hmm_pfn) +{ + return __pfn_to_phys(hmm_pfn & ~HMM_PFN_FLAGS); +} + /* * hmm_pfn_to_map_order() - return the CPU mapping size order * -- cgit v1.2.3 From 8cad47130566123b2c70ef2aa53be02ef1aee5e5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 28 Apr 2025 12:22:17 +0300 Subject: mm/hmm: provide generic DMA managing logic HMM callers use PFN list to populate range while calling to hmm_range_fault(), the conversion from PFN to DMA address is done by the callers with help of another DMA list. However, it is wasteful on any modern platform and by doing the right logic, that DMA list can be avoided. Provide generic logic to manage these lists and gave an interface to map/unmap PFNs to DMA addresses, without requiring from the callers to be an experts in DMA core API. Tested-by: Jens Axboe Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- include/linux/hmm-dma.h | 33 +++++++++++++++++++++++++++++++++ include/linux/hmm.h | 6 +++++- 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 include/linux/hmm-dma.h (limited to 'include/linux') diff --git a/include/linux/hmm-dma.h b/include/linux/hmm-dma.h new file mode 100644 index 000000000000..f58b9fc71999 --- /dev/null +++ b/include/linux/hmm-dma.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (c) 2024 NVIDIA Corporation & Affiliates */ +#ifndef LINUX_HMM_DMA_H +#define LINUX_HMM_DMA_H + +#include + +struct dma_iova_state; +struct pci_p2pdma_map_state; + +/* + * struct hmm_dma_map - array of PFNs and DMA addresses + * + * @state: DMA IOVA state + * @pfns: array of PFNs + * @dma_list: array of DMA addresses + * @dma_entry_size: size of each DMA entry in the array + */ +struct hmm_dma_map { + struct dma_iova_state state; + unsigned long *pfn_list; + dma_addr_t *dma_list; + size_t dma_entry_size; +}; + +int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map, + size_t nr_entries, size_t dma_entry_size); +void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map); +dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map, + size_t idx, + struct pci_p2pdma_map_state *p2pdma_state); +bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx); +#endif /* LINUX_HMM_DMA_H */ diff --git a/include/linux/hmm.h b/include/linux/hmm.h index a43e56f273a1..db75ffc949a7 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -23,6 +23,8 @@ struct mmu_interval_notifier; * HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID) * HMM_PFN_ERROR - accessing the pfn is impossible and the device should * fail. ie poisoned memory, special pages, no vma, etc + * HMM_PFN_P2PDMA - P2P page + * HMM_PFN_P2PDMA_BUS - Bus mapped P2P transfer * HMM_PFN_DMA_MAPPED - Flag preserved on input-to-output transformation * to mark that page is already DMA mapped * @@ -43,8 +45,10 @@ enum hmm_pfn_flags { * don't forget to update HMM_PFN_INOUT_FLAGS */ HMM_PFN_DMA_MAPPED = 1UL << (BITS_PER_LONG - 4), + HMM_PFN_P2PDMA = 1UL << (BITS_PER_LONG - 5), + HMM_PFN_P2PDMA_BUS = 1UL << (BITS_PER_LONG - 6), - HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 9), + HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 11), /* Input flags */ HMM_PFN_REQ_FAULT = HMM_PFN_VALID, -- cgit v1.2.3 From f4856c20c137a73d73e448caa3964098024248bf Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Tue, 29 Apr 2025 02:36:03 +0200 Subject: power: supply: core: Add additional health status values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some batteries can signal when an internal fuse was blown. In such a case POWER_SUPPLY_HEALTH_DEAD is too vague for userspace applications to perform meaningful diagnostics. Additionally some batteries can also signal when some of their internal cells are imbalanced. In such a case returning POWER_SUPPLY_HEALTH_UNSPEC_FAILURE is again too vague for userspace applications to perform meaningful diagnostics. Add new health status values for both cases. Signed-off-by: Armin Wolf Reviewed-by: Sebastian Reichel Link: https://lore.kernel.org/r/20250429003606.303870-1-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/power_supply.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index cbec930430a7..03786c8c2efe 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -71,6 +71,8 @@ enum { POWER_SUPPLY_HEALTH_COOL, POWER_SUPPLY_HEALTH_HOT, POWER_SUPPLY_HEALTH_NO_BATTERY, + POWER_SUPPLY_HEALTH_BLOWN_FUSE, + POWER_SUPPLY_HEALTH_CELL_IMBALANCE, }; enum { -- cgit v1.2.3 From 6157e62b07d9331cc1d4d9d525dab33d45b0e83c Mon Sep 17 00:00:00 2001 From: Paul Geurts Date: Mon, 5 May 2025 13:59:36 +0200 Subject: regulator: pca9450: Add restart handler When restarting a CPU powered by the PCA9450 power management IC, it is beneficial to use the PCA9450 to power cycle the CPU and all its connected peripherals to start up in a known state. The PCA9450 features a cold start procedure initiated by an I2C command. Add a restart handler so that the PCA9450 is used to restart the CPU. The restart handler sends command 0x14 to the SW_RST register, initiating a cold reset (Power recycle all regulators except LDO1, LDO2 and CLK_32K_OUT) As the PCA9450 is a PMIC specific for the i.MX8M family CPU, the restart handler priority is set just slightly higher than imx2_wdt and the PSCI restart handler. This makes sure this restart handler takes precedence. Signed-off-by: Paul Geurts Link: https://patch.msgid.link/20250505115936.1946891-1-paul.geurts@prodrive-technologies.com Signed-off-by: Mark Brown --- include/linux/regulator/pca9450.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/pca9450.h b/include/linux/regulator/pca9450.h index b427b5873de1..85b4fecc10d8 100644 --- a/include/linux/regulator/pca9450.h +++ b/include/linux/regulator/pca9450.h @@ -35,6 +35,8 @@ enum { PCA9450_DVS_LEVEL_MAX, }; +#define PCA9450_RESTART_HANDLER_PRIORITY 130 + #define PCA9450_BUCK1_VOLTAGE_NUM 0x80 #define PCA9450_BUCK2_VOLTAGE_NUM 0x80 #define PCA9450_BUCK3_VOLTAGE_NUM 0x80 @@ -235,4 +237,7 @@ enum { #define I2C_LT_ON_RUN 0x02 #define I2C_LT_FORCE_ENABLE 0x03 +/* PCA9450_REG_SW_RST command */ +#define SW_RST_COMMAND 0x14 + #endif /* __LINUX_REG_PCA9450_H__ */ -- cgit v1.2.3 From b9020bdb9f76b58117f9902db3047693fd12b11b Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 5 May 2025 10:38:17 -0700 Subject: ACPI: MRRM: Minimal parse of ACPI MRRM table The resctrl file system code needs to know how many region tags are supported. Parse the ACPI MRRM table and save the max_mem_region value. Provide a function for resctrl to collect that value. Signed-off-by: Tony Luck Link: https://patch.msgid.link/20250505173819.419271-2-tony.luck@intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 3f2e93ed9730..c409f4cecb09 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -772,6 +772,10 @@ int acpi_get_local_u64_address(acpi_handle handle, u64 *addr); int acpi_get_local_address(acpi_handle handle, u32 *addr); const char *acpi_get_subsystem_id(acpi_handle handle); +#ifdef CONFIG_ACPI_MRRM +int acpi_mrrm_max_mem_region(void); +#endif + #else /* !CONFIG_ACPI */ #define acpi_disabled 1 @@ -1092,6 +1096,11 @@ static inline acpi_handle acpi_get_processor_handle(int cpu) return NULL; } +static inline int acpi_mrrm_max_mem_region(void) +{ + return -ENOENT; +} + #endif /* !CONFIG_ACPI */ #ifdef CONFIG_ACPI_HMAT -- cgit v1.2.3 From 8fb7aee05591fd4d3dca1460448a59e95fa821c3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 9 May 2025 12:12:54 +0100 Subject: io_uring: drain based on allocates reqs Don't rely on CQ sequence numbers for draining, as it has become messy and needs cq_extra adjustments. Instead, base it on the number of allocated requests and only allow flushing when all requests are in the drain list. As a result, cq_extra is gone, no overhead for its accounting in aux cqe posting, less bloating as it was inlined before, and it's in general simpler than trying to track where we should bump it and where it should be put back like in cases of overflow. Also, it'll likely help with cleaning and unifying some of the CQ posting helpers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/46ece1e34320b046c06fee2498d6b4cd12a700f2.1746788718.git.asml.silence@gmail.com Link: https://lore.kernel.org/r/24497b04b004bceada496033d3c9d09ff8e81ae9.1746944903.git.asml.silence@gmail.com [axboe: fold in fix from link2] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 73b289b48280..00dbd7cd0e7d 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -341,7 +341,6 @@ struct io_ring_ctx { unsigned cached_cq_tail; unsigned cq_entries; struct io_ev_fd __rcu *io_ev_fd; - unsigned cq_extra; void *cq_wait_arg; size_t cq_wait_size; @@ -417,6 +416,7 @@ struct io_ring_ctx { struct callback_head poll_wq_task_work; struct list_head defer_list; + unsigned nr_drained; struct io_alloc_cache msg_cache; spinlock_t msg_lock; -- cgit v1.2.3 From 2c04e58e30ce858cc2be531298312c67c7d55fc3 Mon Sep 17 00:00:00 2001 From: Melody Olvera Date: Mon, 12 May 2025 13:54:43 -0700 Subject: soc: qcom: llcc-qcom: Add support for SM8750 Add system cache table and configs for SM8750 SoCs. Signed-off-by: Melody Olvera Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20250512-sm8750_llcc_master-v5-3-d78dca6282a5@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/llcc-qcom.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 8e5d78fb4847..7a69210a250c 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -24,6 +24,7 @@ #define LLCC_CMPTDMA 15 #define LLCC_DISP 16 #define LLCC_VIDFW 17 +#define LLCC_CAMFW 18 #define LLCC_MDMHPFX 20 #define LLCC_MDMPNG 21 #define LLCC_AUDHW 22 @@ -67,6 +68,13 @@ #define LLCC_EVCS_LEFT 67 #define LLCC_EVCS_RIGHT 68 #define LLCC_SPAD 69 +#define LLCC_VIDDEC 70 +#define LLCC_CAMOFE 71 +#define LLCC_CAMRTIP 72 +#define LLCC_CAMSRTIP 73 +#define LLCC_CAMRTRF 74 +#define LLCC_CAMSRTRF 75 +#define LLCC_CPUSSMPAM 89 /** * struct llcc_slice_desc - Cache slice descriptor -- cgit v1.2.3 From d060b6aab031b6113f78cd3d1585115f13386eec Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 12 May 2025 21:53:46 +0100 Subject: helpers: make few bpf helpers public Make bpf_dynptr_slice_rdwr, bpf_dynptr_check_off_len and __bpf_dynptr_write available outside of the helpers.c by adding their prototypes into linux/include/bpf.h. bpf_dynptr_check_off_len() implementation is moved to header and made inline explicitly, as small function should typically be inlined. These functions are going to be used from bpf_trace.c in the next patch of this series. Acked-by: Andrii Nakryiko Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20250512205348.191079-2-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3f0cc89c0622..83c56f40842b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1349,6 +1349,20 @@ u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); +int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, + void *src, u32 len, u64 flags); +void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, + void *buffer__opt, u32 buffer__szk); + +static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +{ + u32 size = __bpf_dynptr_size(ptr); + + if (len > size || offset > size - len) + return -E2BIG; + + return 0; +} #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_link *link, -- cgit v1.2.3 From e8007fad5457ea547ca63bb011fdb03213571c7e Mon Sep 17 00:00:00 2001 From: Steve Siwinski Date: Thu, 8 May 2025 16:01:22 -0400 Subject: scsi: sd_zbc: block: Respect bio vector limits for REPORT ZONES buffer The REPORT ZONES buffer size is currently limited by the HBA's maximum segment count to ensure the buffer can be mapped. However, the block layer further limits the number of iovec entries to 1024 when allocating a bio. To avoid allocation of buffers too large to be mapped, further restrict the maximum buffer size to BIO_MAX_INLINE_VECS. Replace the UIO_MAXIOV symbolic name with the more contextually appropriate BIO_MAX_INLINE_VECS. Fixes: b091ac616846 ("sd_zbc: Fix report zones buffer allocation") Cc: stable@vger.kernel.org Signed-off-by: Steve Siwinski Link: https://lore.kernel.org/r/20250508200122.243129-1-ssiwinski@atto.com Reviewed-by: Damien Le Moal Signed-off-by: Martin K. Petersen --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index cafc7c215de8..b786ec5bcc81 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -11,6 +11,7 @@ #include #define BIO_MAX_VECS 256U +#define BIO_MAX_INLINE_VECS UIO_MAXIOV struct queue_limits; -- cgit v1.2.3 From 0e1c773b501f33437d87b72c7d26080361e224b1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 20 Apr 2025 12:40:24 -0700 Subject: mm/damon/core: introduce damos quota goal metrics for memory node utilization Patch series "mm/damon: auto-tune DAMOS for NUMA setups including tiered memory". Utilizing DAMON for memory tiering usually requires manual tuning and/or tedious controls. Let it self-tune hotness and coldness thresholds for promotion and demotion aiming high utilization of high memory tiers, by introducing new DAMOS quota goal metrics representing the used and the free memory ratios of specific NUMA nodes. And introduce a sample DAMON module that demonstrates how the new feature can be used for memory tiering use cases. Backgrounds =========== A type of tiered memory system exposes the memory tiers as NUMA nodes. A straightforward pages placement strategy for such systems is placing access-hot and cold pages on upper and lower tiers, reespectively, pursuing higher utilization of upper tiers. Since access temperature can be dynamic, periodically finding and migrating hot pages and cold pages to proper tiers (promoting and demoting) is also required. Linux kernel provides several features for such dynamic and transparent pages placement. Page Faults and LRU ------------------- One widely known way is using NUMA balancing in tiering mode (a.k.a NUMAB-2) and reclaim-based demotion features. In the setup, NUMAB-2 finds hot pages using access check-purpose page faults (a.k.a prot_none) and promote those inside each process' context, until there is no more pages to promote, or the upper tier is filled up and memory pressure happens. In the latter case, LRU-based reclaim logic wakes up as a response to the memory pressure and demotes cold pages to lower tiers in asynchronous (kswapd) and/or synchronous ways (direct reclaim). DAMON ----- Yet another available solution is using DAMOS with migrate_hot and migrate_cold DAMOS actions for promotions and demotions, respectively. To make it optimum, users need to specify aggressiveness and access temperature thresholds for promotions and demotions in a good balance that results in high utilization of upper tiers. The number of parameters is not small, and optimum parameter values depend on characteristics of the underlying hardware and the workload. As a result, it often requires manual, time consuming and repetitive tuning of the DAMOS schemes for given workloads and systems combinations. Self-tuned DAMON-based Memory Tiering ===================================== To solve such manual tuning problems, DAMOS provides aim-oriented feedback-driven quotas self-tuning. Using the feature, we design a self-tuned DAMON-based memory tiering for general multi-tier memory systems. For each memory tier node, if it has a lower tier, run a DAMOS scheme that demotes cold pages of the node, auto-tuning the aggressiveness aiming an amount of free space of the node. The free space is for keeping the headroom that avoids significant memory pressure during upper tier memory usage spike, and promoting hot pages from the lower tier. For each memory tier node, if it has an upper tier, run a DAMOS scheme that promotes hot pages of the current node to the upper tier, auto-tuning the aggressiveness aiming a high utilization ratio of the upper tier. The target ratio is to ensure higher tiers are utilized as much as possible. It should match with the headroom for demotion scheme, but have slight overlap, to ensure promotion and demotion are not entirely stopped. The aim-oriented aggressiveness auto-tuning of DAMOS is already available. Hence, to make such tiering solution implementation, only new quota goal metrics for utilization and free space ratio of specific NUMA node need to be developed. Discussions =========== The design imposes below discussion points. Expected Behaviors ------------------ The system will let upper tier memory node accommodates as many hot data as possible. If total amount of the data is less than the top tier memory's promotion/demotion target utilization, entire data will be just placed on the top tier. Promotion scheme will do nothing since there is no data to promote. Demotion scheme will also do nothing since the free space ratio of the top tier is higher than the goal. Only if the amount of data is larger than the top tier's utilization ratio, demotion scheme will demote cold pages and ensure the headroom free space. Since the promotion and demotion schemes for a single node has small overlap at their target utilization and free space goals, promotions and demotions will continue working with a moderate aggressiveness level. It will keep all data is placed on access hotness under dynamic access pattern, while minimizing the migration overhead. In any case, each node will keep headroom free space and as many upper tiers are utilized as possible. Ease of Use ----------- Users still need to set the target utilization and free space ratio, but it will be easier to set. We argue 99.7 % utilization and 0.5 % free space ratios can be good default values. It can be easily adjusted based on desired headroom size of given use case. Users are also still required to answer the minimum coldness and hotness thresholds. Together with monitoring intervals auto-tuning[2], DAMON will always show meaningful amount of hot and cold memory. And DAMOS quota's prioritization mechanism will make good decision as long as the source information is that colorful. Hence, users can very naively set the minimum criterias. We believe any access observation and no access observation within last one aggregation interval is enough for minimum hot and cold regions criterias. General Tiered Memory Setup Applicability ----------------------------------------- The design can be applied to any number of tiers having any performance characteristics, as long as they can be hierarchical. Hence, applying the system to different tiered memory system will be straightforward. Note that this assumes only single CPU NUMA node case. Because today's DAMON is not aware of which CPU made each access, applying this on systems having multiple CPU NUMA nodes can be complicated. We are planning to extend DAMON for the use case, but that's out of the scope of this patch series. How To Use ---------- Users can implement the auto-tuned DAMON-based memory tiering using DAMON sysfs interface. It can be easily done using DAMON user-space tool like user-space tool. Below evaluation results section shows an example DAMON user-space tool command for that. For wider and simpler deployment, having a kernel module that sets up and run the DAMOS schemes via DAMON kernel API can be useful. The module can enable the memory tiering at boot time via kernel command line parameter or at run time with single command. This patch series implements a sample DAMON kernel module that shows how such module can be implemented. Comparison To Page Faults and LRU-based Approaches -------------------------------------------------- The existing page faults based promotion (NUMAB-2) does hot pages detection and migration in the process context. When there are many pages to promote, it can block the progress of the application's real works. DAMOS works in asynchronous worker thread, so it doesn't block the real works. NUMAB-2 doesn't provide a way to control aggressiveness of promotion other than the maximum amount of pages to promote per given time widnow. If hot pages are found, promotions can happen in the upper-bound speed, regardless of upper tier's memory pressure. If the maximum speed is not well set for the given workload, it can result in slow promotion or unnecessary memory pressure. Self-tuned DAMON-based memory tiering alleviates the problem by adjusting the speed based on current utilization of the upper tier. LRU-based demotion can be triggered in both asynchronous (kswapd) and synchronous (direct reclaim) ways. Other than the way of finding cold pages, asynchronous LRU-based demotion and DAMON-based demotion has no big difference. DAMON-based demotion can make a better balancing with DAMON-based promotion, though. The LRU-based demotion can do better than DAMON-based demotion when the tier is having significant memory pressure. It would be wise to use DAMON-based demotion as a proactive and primary one, but utilizing LRU-based demotions together as a fast backup solution. Evaluation ========== In short, under a setup that requires fast and frequent promotions, self-tuned DAMON-based memory tiering's hot pages promotion improves performance about 4.42 %. We believe this shows self-tuned DAMON-based promotion's effectiveness. Meanwhile, NUMAB-2's hot pages promotion degrades the performance about 7.34 %. We suspect the degradation is mostly due to NUMAB-2's synchronous nature that can block the application's progress, which highlights the advantage of DAMON-based solution's asynchronous nature. Note that the test was done with the RFC version of this patch series. We don't run it again since this patch series got no meaningful change after the RFC, while the test takes pretty long time. Setup ----- Hardware. Use a machine that equips 250 GiB DRAM memory tier and 50 GiB CXL memory tier. The tiers are exposed as NUMA nodes 0 and 1, respectively. Kernel. Use Linux kernel v6.13 that modified as following. Add all DAMON patches that available on mm tree of 2025-03-15, and this patch series. Also modify it to ignore mempolicy() system calls, to avoid bad effects from application's traditional NUMA systems assumed optimizations. Workload. Use a modified version of Taobench benchmark[3] that available on DCPerf benchmark suite. It represents an in-memory caching workload. We set its 'memsize', 'warmup_time', and 'test_time' parameter as 340 GiB, 2,500 seconds and 1,440 seconds. The parameters are chosen to ensure the workload uses more than DRAM memory tier. Its RSS under the parameter grows to 270 GiB within the warmup time. It turned out the workload has a very static access pattrn. Only about 13 % of the RSS is frequently accessed from the beginning to end. Hence promotion shows no meaningful performance difference regardless of different design and implementations. We therefore modify the kernel to periodically demote up to 10 GiB hot pages and promote up to 10 GiB cold pages once per minute. The intention is to simulate periodic access pattern changes. The hotness and coldness threshold is very naively set so that it is more like random access pattern change rather than strict hot/cold pages exchange. This is why we call the workload as "modified". It is implemented as two DAMOS schemes each running on an asynchronous thread. It can be reproduced with DAMON user-space tool like below. # ./damo start \ --ops paddr --numa_node 0 --monitoring_intervals 10s 200s 200s \ --damos_action migrate_hot 1 \ --damos_quota_interval 60s --damos_quota_space 10G \ --ops paddr --numa_node 1 --monitoring_intervals 10s 200s 200s \ --damos_action migrate_cold 0 \ --damos_quota_interval 60s --damos_quota_space 10G \ --nr_schemes 1 1 --nr_targets 1 1 --nr_ctxs 1 1 System configurations. Use below variant system configurations. - Baseline. No memory tiering features are turned on. - Numab_tiering. On the baseline, enable NUMAB-2 and relcaim-based demotion. In detail, following command is executed: echo 2 > /proc/sys/kernel/numa_balancing; echo 1 > /sys/kernel/mm/numa/demotion_enabled; echo 7 > /proc/sys/vm/zone_reclaim_mode - DAMON_tiering. On the baseline, utilize self-tuned DAMON-based memory tiering implementation via DAMON user-space tool. It utilizes two kernel threads, namely promotion thread and demotion thread. Demotion thread monitors access pattern of DRAM node using DAMON with auto-tuned monitoring intervals aiming 4% DAMON-observed access ratio, and demote coldest pages up to 200 MiB per second aiming 0.5% free space of DRAM node. Promotion thread monitors CXL node using same intervals auto-tuning, and promote hot pages in same way but aiming for 99.7% utilization of DRAM node. Because DAMON provides only best-effort accuracy, add young page DAMOS filters to allow only and reject all young pages at promoting and demoting, respectively. It can be reproduced with DAMON user-space tool like below. # ./damo start \ --numa_node 0 --monitoring_intervals_goal 4% 3 5ms 10s \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_mem_free_bp 0.5% 0 \ --damos_filter reject young \ --numa_node 1 --monitoring_intervals_goal 4% 3 5ms 10s \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_mem_used_bp 99.7% 0 \ --damos_filter allow young \ --damos_nr_quota_goals 1 1 --damos_nr_filters 1 1 \ --nr_targets 1 1 --nr_schemes 1 1 --nr_ctxs 1 1 Measurment Results ------------------ On each system configuration, run the modified version of Taobench and collect 'score'. 'score' is a metric that calculated and provided by Taobench to represents the performance of the run on the system. To handle the measurement errors, repeat the measurement five times. The results are as below. Config Score Stdev (%) Normalized Baseline 1.6165 0.0319 1.9764 1.0000 Numab_tiering 1.4976 0.0452 3.0209 0.9264 DAMON_tiering 1.6881 0.0249 1.4767 1.0443 'Config' column shows the system config of the measurement. 'Score' column shows the 'score' measurement in average of the five runs on the system config. 'Stdev' column shows the standsard deviation of the five measurements of the scores. '(%)' column shows the 'Stdev' to 'Score' ratio in percentage. Finally, 'Normalized' column shows the averaged score values of the configs that normalized to that of 'Baseline'. The periodic hot pages demotion and cold pages promotion that was conducted to simulate dynamic access pattern was started from the beginning of the workload. It resulted in the DRAM tier utilization always under the watermark, and hence no real demotion was happened for all test runs. This means the above results show no difference between LRU-based and DAMON-based demotions. Only difference between NUMAB-2 and DAMON-based promotions are represented on the results. Numab_tiering config degraded the performance about 7.36 %. We suspect this happened because NUMAB-2's synchronous promotion was blocking the Taobench's real work progress. DAMON_tiering config improved the performance about 4.43 %. We believe this shows effectiveness of DAMON-based promotion that didn't block Taobench's real work progress due to its asynchronous nature. Also this means DAMON's monitoring results are accurate enough to provide visible amount of improvement. Evaluation Limitations ---------------------- As mentioned above, this evaluation shows only comparison of promotion mechanisms. DAMON-based tiering is recommended to be used together with reclaim-based demotion as a faster backup under significant memory pressure, though. From some perspective, the modified version of Taobench may seems making the picture distorted too much. It would be better to evaluate with more realistic workload, or more finely tuned micro benchmarks. Patch Sequence ============== The first patch (patch 1) implements two new quota goal metrics on core layer and expose it to DAMON core kernel API. The second and third ones (patches 2 and 3) further link it to DAMON sysfs interface. Three following patches (patches 4-6) document the new feature and sysfs file on design, usage, and ABI documents. The final one (patch 7) implements a working version of a self-tuned DAMON-based memory tiering solution in an incomplete but easy to understand form as a kernel module under samples/damon/ directory. References ========== [1] https://lore.kernel.org/20231112195602.61525-1-sj@kernel.org/ [2] https://lore.kernel.org/20250303221726.484227-1-sj@kernel.org [3] https://github.com/facebookresearch/DCPerf/blob/main/packages/tao_bench/README.md This patch (of 7): Used and free space ratios for specific NUMA nodes can be useful inputs for NUMA-specific DAMOS schemes' aggressiveness self-tuning feedback loop. Implement DAMOS quota goal metrics for such self-tuned schemes. Link: https://lkml.kernel.org/r/20250420194030.75838-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250420194030.75838-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Yunjeong Mun Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 47e36e6ea203..a4011726cb3b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -145,6 +145,8 @@ enum damos_action { * * @DAMOS_QUOTA_USER_INPUT: User-input value. * @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us. + * @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node. + * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. @@ -152,6 +154,8 @@ enum damos_action { enum damos_quota_goal_metric { DAMOS_QUOTA_USER_INPUT, DAMOS_QUOTA_SOME_MEM_PSI_US, + DAMOS_QUOTA_NODE_MEM_USED_BP, + DAMOS_QUOTA_NODE_MEM_FREE_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -161,6 +165,7 @@ enum damos_quota_goal_metric { * @target_value: Target value of @metric to achieve with the tuning. * @current_value: Current value of @metric. * @last_psi_total: Last measured total PSI + * @nid: Node id. * @list: List head for siblings. * * Data structure for getting the current score of the quota tuning goal. The @@ -179,6 +184,7 @@ struct damos_quota_goal { /* metric-dependent fields */ union { u64 last_psi_total; + int nid; }; struct list_head list; }; -- cgit v1.2.3 From 786d5cc2b92ac331d0654452c6c3cea611772e09 Mon Sep 17 00:00:00 2001 From: "Christoph Lameter (Ampere)" Date: Mon, 21 Apr 2025 13:58:06 -0700 Subject: Update Christoph's Email address and make it consistent Use cl@gentwo.org throughout and remove the old email addresses. Link: https://lkml.kernel.org/r/8b962f57-4d98-cbb0-cd82-b6ba456733e8@gentwo.org Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton --- include/linux/percpu-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 0aeb0e276a3e..c16cdeaa505e 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -375,7 +375,7 @@ do { \ } while (0) /* - * this_cpu operations (C) 2008-2013 Christoph Lameter + * this_cpu operations (C) 2008-2013 Christoph Lameter * * Optimized manipulation for memory allocated through the per cpu * allocator or for addresses of per cpu variables. -- cgit v1.2.3 From 8adce0857769d596c5b000d118119e3bb1d63a32 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Thu, 24 Apr 2025 16:28:05 -0400 Subject: cpuset: rename cpuset_node_allowed to cpuset_current_node_allowed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "vmscan: enforce mems_effective during demotion", v5. Change reclaim to respect cpuset.mems_effective during demotion when possible. Presently, reclaim explicitly ignores cpuset.mems_effective when demoting, which may cause the cpuset settings to violated. Implement cpuset_node_allowed() to check the cpuset.mems_effective associated wih the mem_cgroup of the lruvec being scanned. This only applies to cgroup/cpuset v2, as cpuset exists in a different hierarchy than mem_cgroup in v1. This requires renaming the existing cpuset_node_allowed() to be cpuset_current_now_allowed() - which is more descriptive anyway - to implement the new cpuset_node_allowed() which takes a target cgroup. This patch (of 2): Rename cpuset_node_allowed to reflect that the function checks the current task's cpuset.mems. This allows us to make a new cpuset_node_allowed function that checks a target cgroup's cpuset.mems. Link: https://lkml.kernel.org/r/20250424202806.52632-1-gourry@gourry.net Link: https://lkml.kernel.org/r/20250424202806.52632-2-gourry@gourry.net Signed-off-by: Gregory Price Acked-by: Waiman Long Acked-by: Tejun Heo Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/cpuset.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 5466c96a33db..d6a4fe5c3b6e 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -82,11 +82,11 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p); void cpuset_init_current_mems_allowed(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); -extern bool cpuset_node_allowed(int node, gfp_t gfp_mask); +extern bool cpuset_current_node_allowed(int node, gfp_t gfp_mask); static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { - return cpuset_node_allowed(zone_to_nid(z), gfp_mask); + return cpuset_current_node_allowed(zone_to_nid(z), gfp_mask); } static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) -- cgit v1.2.3 From 7d709f49babc28907b0ac60228f522d2e6216add Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Thu, 24 Apr 2025 16:28:06 -0400 Subject: vmscan,cgroup: apply mems_effective to reclaim MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is possible for a reclaimer to cause demotions of an lruvec belonging to a cgroup with cpuset.mems set to exclude some nodes. Attempt to apply this limitation based on the lruvec's memcg and prevent demotion. Notably, this may still allow demotion of shared libraries or any memory first instantiated in another cgroup. This means cpusets still cannot cannot guarantee complete isolation when demotion is enabled, and the docs have been updated to reflect this. This is useful for isolating workloads on a multi-tenant system from certain classes of memory more consistently - with the noted exceptions. Note on locking: The cgroup_get_e_css reference protects the css->effective_mems, and calls of this interface would be subject to the same race conditions associated with a non-atomic access to cs->effective_mems. So while this interface cannot make strong guarantees of correctness, it can therefore avoid taking a global or rcu_read_lock for performance. Link: https://lkml.kernel.org/r/20250424202806.52632-3-gourry@gourry.net Signed-off-by: Gregory Price Suggested-by: Shakeel Butt Suggested-by: Waiman Long Acked-by: Tejun Heo Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Reviewed-by: Waiman Long Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/cpuset.h | 5 +++++ include/linux/memcontrol.h | 7 +++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index d6a4fe5c3b6e..2ddb256187b5 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -173,6 +173,7 @@ static inline void set_mems_allowed(nodemask_t nodemask) task_unlock(current); } +extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid); #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } @@ -293,6 +294,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq) return false; } +static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid) +{ + return true; +} #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5264d148bdd9..ac1b003ee5b8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1736,6 +1736,8 @@ static inline void count_objcg_events(struct obj_cgroup *objcg, rcu_read_unlock(); } +bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid); + #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1797,6 +1799,11 @@ static inline ino_t page_cgroup_ino(struct page *page) { return 0; } + +static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) +{ + return true; +} #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) -- cgit v1.2.3 From b960818d51b3c8275d0f80c5fc4156eb2ff2fde6 Mon Sep 17 00:00:00 2001 From: Gavin Guo Date: Fri, 25 Apr 2025 18:38:59 +0800 Subject: mm/huge_memory: remove useless folio pointers passing Since the previous commit "mm/huge_memory: Adjust try_to_migrate_one() and split_huge_pmd_locked()" has simplified the logic by leveraging the folio verification in page_vma_mapped_walk(), this patch removes the unnecessary folio pointers passing. Link: https://lkml.kernel.org/r/20250425103859.825879-3-gavinguo@igalia.com Link: https://lore.kernel.org/all/98d1d195-7821-4627-b518-83103ade56c0@redhat.com/ Link: https://lore.kernel.org/all/91599a3c-e69e-4d79-bac5-5013c96203d7@redhat.com/ Signed-off-by: Gavin Guo Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Baolin Wang Cc: Florent Revest Cc: Gavin Shan Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f190998b2ebd..2f190c90192d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -395,7 +395,7 @@ static inline int split_huge_page(struct page *page) void deferred_split_folio(struct folio *folio, bool partially_mapped); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address, bool freeze, struct folio *folio); + unsigned long address, bool freeze); #define split_huge_pmd(__vma, __pmd, __address) \ do { \ @@ -403,12 +403,11 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ - false, NULL); \ + false); \ } while (0) - void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, - bool freeze, struct folio *folio); + bool freeze); void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address); @@ -501,7 +500,7 @@ static inline bool thp_migration_supported(void) } void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, bool freeze, struct folio *folio); + pmd_t *pmd, bool freeze); bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio); @@ -576,12 +575,12 @@ static inline void deferred_split_folio(struct folio *folio, bool partially_mapp do { } while (0) static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address, bool freeze, struct folio *folio) {} + unsigned long address, bool freeze) {} static inline void split_huge_pmd_address(struct vm_area_struct *vma, - unsigned long address, bool freeze, struct folio *folio) {} + unsigned long address, bool freeze) {} static inline void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, - bool freeze, struct folio *folio) {} + bool freeze) {} static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, -- cgit v1.2.3 From bc9817bb7a21f64fbca2c4b83811d943036ec870 Mon Sep 17 00:00:00 2001 From: Huan Yang Date: Fri, 25 Apr 2025 11:19:23 +0800 Subject: mm/memcg: move mem_cgroup_init() ahead of cgroup_init() Patch series "Use kmem_cache for memcg alloc", v3. (willy tldr: "you've gone from allocating 8 objects per 32KiB to allocating 13 objects per 32KiB, a 62% improvement in memory consumption" [1]) The mem_cgroup_alloc function creates mem_cgroup struct and it's associated structures including mem_cgroup_per_node. Through detailed analysis on our test machine (Arm64, 16GB RAM, 6.6 kernel, 1 NUMA node, memcgv2 with nokmem,nosocket,cgroup_disable=pressure), we can observe the memory allocation for these structures using the following shell commands: # Enable tracing echo 1 > /sys/kernel/tracing/events/kmem/kmalloc/enable echo 1 > /sys/kernel/tracing/tracing_on cat /sys/kernel/tracing/trace_pipe | grep kmalloc | grep mem_cgroup # Trigger allocation if cgroup subtree do not enable memcg echo +memory > /sys/fs/cgroup/cgroup.subtree_control Ftrace Output: # mem_cgroup struct allocation sh-6312 [000] ..... 58015.698365: kmalloc: call_site=mem_cgroup_css_alloc+0xd8/0x5b4 ptr=000000003e4c3799 bytes_req=2312 bytes_alloc=4096 gfp_flags=GFP_KERNEL|__GFP_ZERO node=-1 accounted=false # mem_cgroup_per_node allocation sh-6312 [000] ..... 58015.698389: kmalloc: call_site=mem_cgroup_css_alloc+0x1d8/0x5b4 ptr=00000000d798700c bytes_req=2896 bytes_alloc=4096 gfp_flags=GFP_KERNEL|__GFP_ZERO node=0 accounted=false Key Observations: 1. Both structures use kmalloc with requested sizes between 2KB-4KB 2. Allocation alignment forces 4KB slab usage due to pre-defined sizes (64B, 128B,..., 2KB, 4KB, 8KB) 3. Memory waste per memcg instance: Base struct: 4096 - 2312 = 1784 bytes Per-node struct: 4096 - 2896 = 1200 bytes Total waste: 2984 bytes (1-node system) NUMA scaling: (1200 + 8) * nr_node_ids bytes So, it's a little waste. This patchset introduces dedicated kmem_cache: Patch2 - mem_cgroup kmem_cache - memcg_cachep Patch3 - mem_cgroup_per_node kmem_cache - memcg_pn_cachep The benefits of this change can be observed with the following tracing commands: # Enable tracing echo 1 > /sys/kernel/tracing/events/kmem/kmem_cache_alloc/enable echo 1 > /sys/kernel/tracing/tracing_on cat /sys/kernel/tracing/trace_pipe | grep kmem_cache_alloc | grep mem_cgroup # In another terminal: echo +memory > /sys/fs/cgroup/cgroup.subtree_control The output might now look like this: # mem_cgroup struct allocation sh-9827 [000] ..... 289.513598: kmem_cache_alloc: call_site=mem_cgroup_css_alloc+0xbc/0x5d4 ptr=00000000695c1806 bytes_req=2312 bytes_alloc=2368 gfp_flags=GFP_KERNEL|__GFP_ZERO node=-1 accounted=false # mem_cgroup_per_node allocation sh-9827 [000] ..... 289.513602: kmem_cache_alloc: call_site=mem_cgroup_css_alloc+0x1b8/0x5d4 ptr=000000002989e63a bytes_req=2896 bytes_alloc=2944 gfp_flags=GFP_KERNEL|__GFP_ZERO node=0 accounted=false This indicates that the `mem_cgroup` struct now requests 2312 bytes and is allocated 2368 bytes, while `mem_cgroup_per_node` requests 2896 bytes and is allocated 2944 bytes. The slight increase in allocated size is due to `SLAB_HWCACHE_ALIGN` in the `kmem_cache`. Without `SLAB_HWCACHE_ALIGN`, the allocation might appear as: # mem_cgroup struct allocation sh-9269 [003] ..... 80.396366: kmem_cache_alloc: call_site=mem_cgroup_css_alloc+0xbc/0x5d4 ptr=000000005b12b475 bytes_req=2312 bytes_alloc=2312 gfp_flags=GFP_KERNEL|__GFP_ZERO node=-1 accounted=false # mem_cgroup_per_node allocation sh-9269 [003] ..... 80.396411: kmem_cache_alloc: call_site=mem_cgroup_css_alloc+0x1b8/0x5d4 ptr=00000000f347adc6 bytes_req=2896 bytes_alloc=2896 gfp_flags=GFP_KERNEL|__GFP_ZERO node=0 accounted=false While the `bytes_alloc` now matches the `bytes_req`, this patchset defaults to using `SLAB_HWCACHE_ALIGN` as it is generally considered more beneficial for performance. Please let me know if there are any issues or if I've misunderstood anything. This patchset also move mem_cgroup_init ahead of cgroup_init() due to cgroup_init() will allocate root_mem_cgroup, but each initcall invoke after cgroup_init, so if each kmem_cache do not prepare, we need testing NULL before use it. This patch (of 3): When cgroup_init() creates root_mem_cgroup through css_alloc callback, some critical resources might not be fully initialized, forcing later operations to perform conditional checks for resource availability. This patch move mem_cgroup_init() to address the init order, it invoke before cgroup_init, so, compare to subsys_initcall, it can use to prepare some key resources before root_mem_cgroup alloc. Link: https://lkml.kernel.org/r/aAsRCj-niMMTtmK8@casper.infradead.org [1] Link: https://lkml.kernel.org/r/20250425031935.76411-1-link@vivo.com Link: https://lkml.kernel.org/r/20250425031935.76411-2-link@vivo.com Signed-off-by: Huan Yang Suggested-by: Shakeel Butt Acked-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Francesco Valla Cc: guoweikang Cc: Huang Shijie Cc: KP Singh Cc: Michal Hocko Cc: Muchun Song Cc: "Paul E . McKenney" Cc: Petr Mladek Cc: Rasmus Villemoes Cc: Raul E Rangel Cc: Roman Gushchin Cc: "Uladzislau Rezki (Sony)" Cc: Vlastimil Babka Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ac1b003ee5b8..9ed75f82b858 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1057,6 +1057,7 @@ static inline u64 cgroup_id_from_mm(struct mm_struct *mm) return id; } +extern int mem_cgroup_init(void); #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1472,6 +1473,8 @@ static inline u64 cgroup_id_from_mm(struct mm_struct *mm) { return 0; } + +static inline int mem_cgroup_init(void) { return 0; } #endif /* CONFIG_MEMCG */ /* -- cgit v1.2.3 From 8d88b0769e256c238f80615f08fc5b7aebc29439 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 2 Apr 2025 20:56:13 +0000 Subject: mm/hugetlb: use separate nodemask for bootmem allocations Hugetlb boot allocation has used online nodes for allocation since commit de55996d7188 ("mm/hugetlb: use online nodes for bootmem allocation"). This was needed to be able to do the allocations earlier in boot, before N_MEMORY was set. This might lead to a different distribution of gigantic hugepages across NUMA nodes if there are memoryless nodes in the system. What happens is that the memoryless nodes are tried, but then the memblock allocation fails and falls back, which usually means that the node that has the highest physical address available will be used (top-down allocation). While this will end up getting the same number of hugetlb pages, they might not be be distributed the same way. The fallback for each memoryless node might not end up coming from the same node as the successful round-robin allocation from N_MEMORY nodes. While administrators that rely on having a specific number of hugepages per node should use the hugepages=N:X syntax, it's better not to change the old behavior for the plain hugepages=N case. To do this, construct a nodemask for hugetlb bootmem purposes only, containing nodes that have memory. Then use that for round-robin bootmem allocations. This saves some cycles, and the added advantage here is that hugetlb_cma can use it too, avoiding the older issue of pointless attempts to create a CMA area for memoryless nodes (which will also cause the per-node CMA area size to be too small). Link: https://lkml.kernel.org/r/20250402205613.3086864-1-fvdl@google.com Fixes: de55996d7188 ("mm/hugetlb: use online nodes for bootmem allocation") Signed-off-by: Frank van der Linden Reviewed-by: Oscar Salvador Reviewed-by: Luiz Capitulino Cc: David Hildenbrand Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a57bed83c657..23ebf49c5d6a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -14,6 +14,7 @@ #include #include #include +#include struct ctl_table; struct user_struct; @@ -176,6 +177,8 @@ extern struct list_head huge_boot_pages[MAX_NUMNODES]; void hugetlb_bootmem_alloc(void); bool hugetlb_bootmem_allocated(void); +extern nodemask_t hugetlb_bootmem_nodes; +void hugetlb_bootmem_set_nodes(void); /* arch callbacks */ -- cgit v1.2.3 From 68a1436bde00b40327efe27926076b53088775a3 Mon Sep 17 00:00:00 2001 From: Zhongkun He Date: Mon, 21 Apr 2025 17:13:28 +0800 Subject: mm: add swappiness=max arg to memory.reclaim for only anon reclaim Patch series "add max arg to swappiness in memory.reclaim and lru_gen", v4. This patchset adds max arg to swappiness in memory.reclaim and lru_gen for anon only proactive memory reclaim. With commit <68cd9050d871> ("mm: add swappiness= arg to memory.reclaim") we can submit an additional swappiness= argument to memory.reclaim. It is very useful because we can dynamically adjust the reclamation ratio based on the anonymous folios and file folios of each cgroup. For example,when swappiness is set to 0, we only reclaim from file folios. But we can not relciam memory just from anon folios. This patchset introduces a new macro, SWAPPINESS_ANON_ONLY, defined as MAX_SWAPPINESS + 1, represent the max arg semantics. It specifically indicates that reclamation should occur only from anonymous pages. Patch 1 adds swappiness=max arg to memory.reclaim suggested-by: Yosry Ahmed Patch 2 add more comments for cache_trim_mode from Johannes Weiner in [1]. Patch 3 add max arg to lru_gen for proactive memory reclaim in MGLRU. The MGLRU already supports reclaiming exclusively from anonymous pages. This patch formalizes that behavior by introducing a max parameter to represent the corresponding semantics. Patch 4 using SWAPPINESS_ANON_ONLY in MGLRU Using SWAPPINESS_ANON_ONLY instead of MAX_SWAPPINESS + 1 to indicate reclaiming only from anonymous pages makes the code more readable and explicit Here is the previous discussion: https://lore.kernel.org/all/20250314033350.1156370-1-hezhongkun.hzk@bytedance.com/ https://lore.kernel.org/all/20250312094337.2296278-1-hezhongkun.hzk@bytedance.com/ https://lore.kernel.org/all/20250318135330.3358345-1-hezhongkun.hzk@bytedance.com/ This patch (of 4): With commit <68cd9050d871> ("mm: add swappiness= arg to memory.reclaim") we can submit an additional swappiness= argument to memory.reclaim. It is very useful because we can dynamically adjust the reclamation ratio based on the anonymous folios and file folios of each cgroup. For example,when swappiness is set to 0, we only reclaim from file folios. However,we have also encountered a new issue: when swappiness is set to the MAX_SWAPPINESS, it may still only reclaim file folios. So, we hope to add a new arg 'swappiness=max' in memory.reclaim where proactive memory reclaim only reclaims from anonymous folios when swappiness is set to max. The swappiness semantics from a user perspective remain unchanged. For example, something like this: echo "2M swappiness=max" > /sys/fs/cgroup/memory.reclaim will perform reclaim on the rootcg with a swappiness setting of 'max' (a new mode) regardless of the file folios. Users have a more comprehensive view of the application's memory distribution because there are many metrics available. For example, if we find that a certain cgroup has a large number of inactive anon folios, we can reclaim only those and skip file folios, because with the zram/zswap, the IO tradeoff that cache_trim_mode or other file first logic is making doesn't hold - file refaults will cause IO, whereas anon decompression will not. With this patch, the swappiness argument of memory.reclaim has a new mode 'max', means reclaiming just from anonymous folios both in traditional LRU and MGLRU. Link: https://lkml.kernel.org/r/cover.1745225696.git.hezhongkun.hzk@bytedance.com Link: https://lore.kernel.org/all/20250314141833.GA1316033@cmpxchg.org/ [1] Link: https://lkml.kernel.org/r/519e12b9b1f8c31a01e228c8b4b91a2419684f77.1745225696.git.hezhongkun.hzk@bytedance.com Signed-off-by: Zhongkun He Suggested-by: Yosry Ahmed Acked-by: Muchun Song Cc: Johannes Weiner Cc: Michal Hocko Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 4e4e27d3ce3d..bc0e1c275fc0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -414,6 +414,10 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #define MEMCG_RECLAIM_PROACTIVE (1 << 2) #define MIN_SWAPPINESS 0 #define MAX_SWAPPINESS 200 + +/* Just recliam from anon folios in proactive memory reclaim */ +#define SWAPPINESS_ANON_ONLY (MAX_SWAPPINESS + 1) + extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, -- cgit v1.2.3 From a3365bdca2200e24d815b19382d3f05deb8567ab Mon Sep 17 00:00:00 2001 From: Cheng-Han Wu Date: Sun, 27 Apr 2025 22:50:04 +0800 Subject: mm: remove unused macro INIT_PASID The macro INIT_PASID was originally used by mm_init_pasid. However, since commit a6cbd44093ef ("kernel/fork: Initialize mm's PASID"), mm_init_pasid has been removed. Therefore, INIT_PASID is no longer needed and is removed. Link: https://lkml.kernel.org/r/20250427145004.13049-1-hank20010209@gmail.com Signed-off-by: Cheng-Han Wu Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 56d07edd01f9..e76bade9ebb1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -28,7 +28,6 @@ #endif #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) -#define INIT_PASID 0 struct address_space; struct mem_cgroup; -- cgit v1.2.3 From 4c78cc596bb8d39532f059e0198eeabf370c50f5 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Fri, 9 May 2025 00:46:19 -0700 Subject: memblock: add MEMBLOCK_RSRV_KERN flag Patch series "kexec: introduce Kexec HandOver (KHO)", v8. Kexec today considers itself purely a boot loader: When we enter the new kernel, any state the previous kernel left behind is irrelevant and the new kernel reinitializes the system. However, there are use cases where this mode of operation is not what we actually want. In virtualization hosts for example, we want to use kexec to update the host kernel while virtual machine memory stays untouched. When we add device assignment to the mix, we also need to ensure that IOMMU and VFIO states are untouched. If we add PCIe peer to peer DMA, we need to do the same for the PCI subsystem. If we want to kexec while an SEV-SNP enabled virtual machine is running, we need to preserve the VM context pages and physical memory. See "pkernfs: Persisting guest memory and kernel/device state safely across kexec" Linux Plumbers Conference 2023 presentation for details: https://lpc.events/event/17/contributions/1485/ To start us on the journey to support all the use cases above, this patch implements basic infrastructure to allow hand over of kernel state across kexec (Kexec HandOver, aka KHO). As a really simple example target, we use memblock's reserve_mem. With this patchset applied, memory that was reserved using "reserve_mem" command line options remains intact after kexec and it is guaranteed to reside at the same physical address. == Alternatives == There are alternative approaches to (parts of) the problems above: * Memory Pools [1] - preallocated persistent memory region + allocator * PRMEM [2] - resizable persistent memory regions with fixed metadata pointer on the kernel command line + allocator * Pkernfs [3] - preallocated file system for in-kernel data with fixed address location on the kernel command line * PKRAM [4] - handover of user space pages using a fixed metadata page specified via command line All of the approaches above fundamentally have the same problem: They require the administrator to explicitly carve out a physical memory location because they have no mechanism outside of the kernel command line to pass data (including memory reservations) between kexec'ing kernels. KHO provides that base foundation. We will determine later whether we still need any of the approaches above for fast bulk memory handover of for example IOMMU page tables. But IMHO they would all be users of KHO, with KHO providing the foundational primitive to pass metadata and bulk memory reservations as well as provide easy versioning for data. == Overview == We introduce a metadata file that the kernels pass between each other. How they pass it is architecture specific. The file's format is a Flattened Device Tree (fdt) which has a generator and parser already included in Linux. KHO is enabled in the kernel command line by `kho=on`. When the root user enables KHO through /sys/kernel/debug/kho/out/finalize, the kernel invokes callbacks to every KHO users to register preserved memory regions, which contain drivers' states. When the actual kexec happens, the fdt is part of the image set that we boot into. In addition, we keep "scratch regions" available for kexec: physically contiguous memory regions that are guaranteed to not have any memory that KHO would preserve. The new kernel bootstraps itself using the scratch regions and sets all handed over memory as in use. When drivers initialize that support KHO, they introspect the fdt, restore preserved memory regions, and retrieve their states stored in the preserved memory. == Limitations == Currently KHO is only implemented for file based kexec. The kernel interfaces in the patch set are already in place to support user space kexec as well, but it is still not implemented it yet inside kexec tools. == How to Use == To use the code, please boot the kernel with the "kho=on" command line parameter. KHO will automatically create scratch regions. If you want to set the scratch size explicitly you can use "kho_scratch=" command line parameter. For instance, "kho_scratch=16M,512M,256M" will reserve a 16 MiB low memory scratch area, a 512 MiB global scratch region, and 256 MiB per NUMA node scratch regions on boot. Make sure to have a reserved memory range requested with reserv_mem command line option, for example, "reserve_mem=64m:4k:n1". Then before you invoke file based "kexec -l", finalize KHO FDT: # echo 1 > /sys/kernel/debug/kho/out/finalize You can preview the generated FDT using `dtc`, # dtc /sys/kernel/debug/kho/out/fdt # dtc /sys/kernel/debug/kho/out/sub_fdts/memblock `dtc` is available on ubuntu by `sudo apt-get install device-tree-compiler`. Now kexec into the new kernel, # kexec -l Image --initrd=initrd -s # kexec -e (The order of KHO finalization and "kexec -l" does not matter.) The new kernel will boot up and contain the previous kernel's reserve_mem contents at the same physical address as the first kernel. You can also review the FDT passed from the old kernel, # dtc /sys/kernel/debug/kho/in/fdt # dtc /sys/kernel/debug/kho/in/sub_fdts/memblock This patch (of 17): To denote areas that were reserved for kernel use either directly with memblock_reserve_kern() or via memblock allocations. Link: https://lore.kernel.org/lkml/20250424083258.2228122-1-changyuanl@google.com/ Link: https://lore.kernel.org/lkml/aAeaJ2iqkrv_ffhT@kernel.org/ Link: https://lore.kernel.org/lkml/35c58191-f774-40cf-8d66-d1e2aaf11a62@intel.com/ Link: https://lore.kernel.org/lkml/20250424093302.3894961-1-arnd@kernel.org/ Link: https://lkml.kernel.org/r/20250509074635.3187114-1-changyuanl@google.com Link: https://lkml.kernel.org/r/20250509074635.3187114-2-changyuanl@google.com Signed-off-by: Mike Rapoport (Microsoft) Co-developed-by: Changyuan Lyu Signed-off-by: Changyuan Lyu Cc: Alexander Graf Cc: Andy Lutomirski Cc: Anthony Yznaga Cc: Arnd Bergmann Cc: Ashish Kalra Cc: Ben Herrenschmidt Cc: Borislav Betkov Cc: Catalin Marinas Cc: David Woodhouse Cc: Eric Biederman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Gowans Cc: Jonathan Corbet Cc: Krzysztof Kozlowski Cc: Marc Rutland Cc: Paolo Bonzini Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Pratyush Yadav Cc: Rob Herring Cc: Saravana Kannan Cc: Stanislav Kinsburskii Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Thomas Lendacky Cc: Will Deacon Cc: Dave Hansen Cc: Jason Gunthorpe Signed-off-by: Andrew Morton --- include/linux/memblock.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index ef5a1ecc6e59..6c00fbc08513 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -42,6 +42,9 @@ extern unsigned long long max_possible_pfn; * kernel resource tree. * @MEMBLOCK_RSRV_NOINIT: memory region for which struct pages are * not initialized (only for reserved regions). + * @MEMBLOCK_RSRV_KERN: memory region that is reserved for kernel use, + * either explictitly with memblock_reserve_kern() or via memblock + * allocation APIs. All memblock allocations set this flag. */ enum memblock_flags { MEMBLOCK_NONE = 0x0, /* No special request */ @@ -50,6 +53,7 @@ enum memblock_flags { MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */ MEMBLOCK_DRIVER_MANAGED = 0x8, /* always detected via a driver */ MEMBLOCK_RSRV_NOINIT = 0x10, /* don't initialize struct pages */ + MEMBLOCK_RSRV_KERN = 0x20, /* memory reserved for kernel use */ }; /** @@ -116,7 +120,19 @@ int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid, int memblock_add(phys_addr_t base, phys_addr_t size); int memblock_remove(phys_addr_t base, phys_addr_t size); int memblock_phys_free(phys_addr_t base, phys_addr_t size); -int memblock_reserve(phys_addr_t base, phys_addr_t size); +int __memblock_reserve(phys_addr_t base, phys_addr_t size, int nid, + enum memblock_flags flags); + +static __always_inline int memblock_reserve(phys_addr_t base, phys_addr_t size) +{ + return __memblock_reserve(base, size, NUMA_NO_NODE, 0); +} + +static __always_inline int memblock_reserve_kern(phys_addr_t base, phys_addr_t size) +{ + return __memblock_reserve(base, size, NUMA_NO_NODE, MEMBLOCK_RSRV_KERN); +} + #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP int memblock_physmem_add(phys_addr_t base, phys_addr_t size); #endif @@ -476,6 +492,7 @@ static inline __init_memblock bool memblock_bottom_up(void) phys_addr_t memblock_phys_mem_size(void); phys_addr_t memblock_reserved_size(void); +phys_addr_t memblock_reserved_kern_size(phys_addr_t limit, int nid); unsigned long memblock_estimated_nr_free_pages(void); phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); -- cgit v1.2.3 From d59f43b5748092557d34244e29a618221a250501 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 9 May 2025 00:46:20 -0700 Subject: memblock: add support for scratch memory With KHO (Kexec HandOver), we need a way to ensure that the new kernel does not allocate memory on top of any memory regions that the previous kernel was handing over. But to know where those are, we need to include them in the memblock.reserved array which may not be big enough to hold all ranges that need to be persisted across kexec. To resize the array, we need to allocate memory. That brings us into a catch 22 situation. The solution to that is limit memblock allocations to the scratch regions: safe regions to operate in the case when there is memory that should remain intact across kexec. KHO provides several "scratch regions" as part of its metadata. These scratch regions are contiguous memory blocks that known not to contain any memory that should be persisted across kexec. These regions should be large enough to accommodate all memblock allocations done by the kexeced kernel. We introduce a new memblock_set_scratch_only() function that allows KHO to indicate that any memblock allocation must happen from the scratch regions. Later, we may want to perform another KHO kexec. For that, we reuse the same scratch regions. To ensure that no eventually handed over data gets allocated inside a scratch region, we flip the semantics of the scratch region with memblock_clear_scratch_only(): After that call, no allocations may happen from scratch memblock regions. We will lift that restriction in the next patch. Link: https://lkml.kernel.org/r/20250509074635.3187114-3-changyuanl@google.com Signed-off-by: Alexander Graf Co-developed-by: Mike Rapoport (Microsoft) Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Changyuan Lyu Cc: Andy Lutomirski Cc: Anthony Yznaga Cc: Arnd Bergmann Cc: Ashish Kalra Cc: Ben Herrenschmidt Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David Woodhouse Cc: Eric Biederman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Gowans Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Krzysztof Kozlowski Cc: Marc Rutland Cc: Paolo Bonzini Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Pratyush Yadav Cc: Rob Herring Cc: Saravana Kannan Cc: Stanislav Kinsburskii Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Thomas Lendacky Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memblock.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 6c00fbc08513..993937a6b962 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -45,6 +45,11 @@ extern unsigned long long max_possible_pfn; * @MEMBLOCK_RSRV_KERN: memory region that is reserved for kernel use, * either explictitly with memblock_reserve_kern() or via memblock * allocation APIs. All memblock allocations set this flag. + * @MEMBLOCK_KHO_SCRATCH: memory region that kexec can pass to the next + * kernel in handover mode. During early boot, we do not know about all + * memory reservations yet, so we get scratch memory from the previous + * kernel that we know is good to use. It is the only memory that + * allocations may happen from in this phase. */ enum memblock_flags { MEMBLOCK_NONE = 0x0, /* No special request */ @@ -54,6 +59,7 @@ enum memblock_flags { MEMBLOCK_DRIVER_MANAGED = 0x8, /* always detected via a driver */ MEMBLOCK_RSRV_NOINIT = 0x10, /* don't initialize struct pages */ MEMBLOCK_RSRV_KERN = 0x20, /* memory reserved for kernel use */ + MEMBLOCK_KHO_SCRATCH = 0x40, /* scratch memory for kexec handover */ }; /** @@ -148,6 +154,8 @@ int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size); +int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size); +int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size); void memblock_free(void *ptr, size_t size); void reset_all_zones_managed_pages(void); @@ -291,6 +299,11 @@ static inline bool memblock_is_driver_managed(struct memblock_region *m) return m->flags & MEMBLOCK_DRIVER_MANAGED; } +static inline bool memblock_is_kho_scratch(struct memblock_region *m) +{ + return m->flags & MEMBLOCK_KHO_SCRATCH; +} + int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, @@ -619,5 +632,12 @@ static inline void early_memtest(phys_addr_t start, phys_addr_t end) { } static inline void memtest_report_meminfo(struct seq_file *m) { } #endif +#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH +void memblock_set_kho_scratch_only(void); +void memblock_clear_kho_scratch_only(void); +#else +static inline void memblock_set_kho_scratch_only(void) { } +static inline void memblock_clear_kho_scratch_only(void) { } +#endif #endif /* _LINUX_MEMBLOCK_H */ -- cgit v1.2.3 From b8a8f96a6dce527ad316184ff1e20f238ed413d8 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Fri, 9 May 2025 00:46:21 -0700 Subject: memblock: introduce memmap_init_kho_scratch() With deferred initialization of struct page it will be necessary to initialize memory map for KHO scratch regions early. Add memmap_init_kho_scratch() method that will allow such initialization in upcoming patches. Link: https://lkml.kernel.org/r/20250509074635.3187114-4-changyuanl@google.com Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Changyuan Lyu Cc: Alexander Graf Cc: Andy Lutomirski Cc: Anthony Yznaga Cc: Arnd Bergmann Cc: Ashish Kalra Cc: Ben Herrenschmidt Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David Woodhouse Cc: Eric Biederman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Gowans Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Krzysztof Kozlowski Cc: Marc Rutland Cc: Paolo Bonzini Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Pratyush Yadav Cc: Rob Herring Cc: Saravana Kannan Cc: Stanislav Kinsburskii Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Thomas Lendacky Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memblock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 993937a6b962..bb19a2534224 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -635,9 +635,11 @@ static inline void memtest_report_meminfo(struct seq_file *m) { } #ifdef CONFIG_MEMBLOCK_KHO_SCRATCH void memblock_set_kho_scratch_only(void); void memblock_clear_kho_scratch_only(void); +void memmap_init_kho_scratch_pages(void); #else static inline void memblock_set_kho_scratch_only(void) { } static inline void memblock_clear_kho_scratch_only(void) { } +static inline void memmap_init_kho_scratch_pages(void) {} #endif #endif /* _LINUX_MEMBLOCK_H */ -- cgit v1.2.3 From 3dc92c311498c4d307cfdd0c6c3ac9355b50f683 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 9 May 2025 00:46:22 -0700 Subject: kexec: add Kexec HandOver (KHO) generation helpers Add the infrastructure to generate Kexec HandOver metadata. Kexec HandOver is a mechanism that allows Linux to preserve state - arbitrary properties as well as memory locations - across kexec. It does so using 2 concepts: 1) KHO FDT - Every KHO kexec carries a KHO specific flattened device tree blob that describes preserved memory regions. Device drivers can register to KHO to serialize and preserve their states before kexec. 2) Scratch Regions - CMA regions that we allocate in the first kernel. CMA gives us the guarantee that no handover pages land in those regions, because handover pages must be at a static physical memory location. We use these regions as the place to load future kexec images so that they won't collide with any handover data. Link: https://lkml.kernel.org/r/20250509074635.3187114-5-changyuanl@google.com Signed-off-by: Alexander Graf Co-developed-by: Mike Rapoport (Microsoft) Signed-off-by: Mike Rapoport (Microsoft) Co-developed-by: Pratyush Yadav Signed-off-by: Pratyush Yadav Co-developed-by: Changyuan Lyu Signed-off-by: Changyuan Lyu Cc: Andy Lutomirski Cc: Anthony Yznaga Cc: Arnd Bergmann Cc: Ashish Kalra Cc: Ben Herrenschmidt Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David Woodhouse Cc: Eric Biederman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Gowans Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Krzysztof Kozlowski Cc: Marc Rutland Cc: Paolo Bonzini Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Rob Herring Cc: Saravana Kannan Cc: Stanislav Kinsburskii Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Thomas Lendacky Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 59 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 include/linux/kexec_handover.h (limited to 'include/linux') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h new file mode 100644 index 000000000000..2e19004776f6 --- /dev/null +++ b/include/linux/kexec_handover.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_KEXEC_HANDOVER_H +#define LINUX_KEXEC_HANDOVER_H + +#include +#include + +struct kho_scratch { + phys_addr_t addr; + phys_addr_t size; +}; + +/* KHO Notifier index */ +enum kho_event { + KEXEC_KHO_FINALIZE = 0, + KEXEC_KHO_ABORT = 1, +}; + +struct notifier_block; + +struct kho_serialization; + +#ifdef CONFIG_KEXEC_HANDOVER +bool kho_is_enabled(void); + +int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt); + +int register_kho_notifier(struct notifier_block *nb); +int unregister_kho_notifier(struct notifier_block *nb); + +void kho_memory_init(void); +#else +static inline bool kho_is_enabled(void) +{ + return false; +} + +static inline int kho_add_subtree(struct kho_serialization *ser, + const char *name, void *fdt) +{ + return -EOPNOTSUPP; +} + +static inline int register_kho_notifier(struct notifier_block *nb) +{ + return -EOPNOTSUPP; +} + +static inline int unregister_kho_notifier(struct notifier_block *nb) +{ + return -EOPNOTSUPP; +} + +static inline void kho_memory_init(void) +{ +} +#endif /* CONFIG_KEXEC_HANDOVER */ + +#endif /* LINUX_KEXEC_HANDOVER_H */ -- cgit v1.2.3 From c609c144b0e8dbc19712ff8c8a0929be38afe58d Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 9 May 2025 00:46:23 -0700 Subject: kexec: add KHO parsing support When we have a KHO kexec, we get an FDT blob and scratch region to populate the state of the system. Provide helper functions that allow architecture code to easily handle memory reservations based on them and give device drivers visibility into the KHO FDT and memory reservations so they can recover their own state. Include a fix from Arnd Bergmann https://lore.kernel.org/lkml/20250424093302.3894961-1-arnd@kernel.org/. Link: https://lkml.kernel.org/r/20250509074635.3187114-6-changyuanl@google.com Signed-off-by: Alexander Graf Signed-off-by: Arnd Bergmann Co-developed-by: Mike Rapoport (Microsoft) Signed-off-by: Mike Rapoport (Microsoft) Co-developed-by: Changyuan Lyu Signed-off-by: Changyuan Lyu Cc: Andy Lutomirski Cc: Anthony Yznaga Cc: Ashish Kalra Cc: Ben Herrenschmidt Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David Woodhouse Cc: Eric Biederman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Gowans Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Krzysztof Kozlowski Cc: Marc Rutland Cc: Paolo Bonzini Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Pratyush Yadav Cc: Rob Herring Cc: Saravana Kannan Cc: Stanislav Kinsburskii Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Thomas Lendacky Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 2e19004776f6..02dcfc8c427e 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -24,11 +24,15 @@ struct kho_serialization; bool kho_is_enabled(void); int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt); +int kho_retrieve_subtree(const char *name, phys_addr_t *phys); int register_kho_notifier(struct notifier_block *nb); int unregister_kho_notifier(struct notifier_block *nb); void kho_memory_init(void); + +void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, + u64 scratch_len); #else static inline bool kho_is_enabled(void) { @@ -41,6 +45,11 @@ static inline int kho_add_subtree(struct kho_serialization *ser, return -EOPNOTSUPP; } +static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +{ + return -EOPNOTSUPP; +} + static inline int register_kho_notifier(struct notifier_block *nb) { return -EOPNOTSUPP; @@ -54,6 +63,11 @@ static inline int unregister_kho_notifier(struct notifier_block *nb) static inline void kho_memory_init(void) { } + +static inline void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, + phys_addr_t scratch_phys, u64 scratch_len) +{ +} #endif /* CONFIG_KEXEC_HANDOVER */ #endif /* LINUX_KEXEC_HANDOVER_H */ -- cgit v1.2.3 From fc33e4b44b2717feba2f6f07ce7943a96499c9ec Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Fri, 9 May 2025 00:46:24 -0700 Subject: kexec: enable KHO support for memory preservation Introduce APIs allowing KHO users to preserve memory across kexec and get access to that memory after boot of the kexeced kernel kho_preserve_folio() - record a folio to be preserved over kexec kho_restore_folio() - recreates the folio from the preserved memory kho_preserve_phys() - record physically contiguous range to be preserved over kexec. The memory preservations are tracked by two levels of xarrays to manage chunks of per-order 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a 1TB x86 system would fit inside a single 512 byte bitmap. For order 0 allocations each bitmap will cover 16M of address space. Thus, for 16G of memory at most 512K of bitmap memory will be needed for order 0. At serialization time all bitmaps are recorded in a linked list of pages for the next kernel to process and the physical address of the list is recorded in KHO FDT. The next kernel then processes that list, reserves the memory ranges and later, when a user requests a folio or a physical range, KHO restores corresponding memory map entries. Link: https://lkml.kernel.org/r/20250509074635.3187114-7-changyuanl@google.com Suggested-by: Jason Gunthorpe Signed-off-by: Mike Rapoport (Microsoft) Co-developed-by: Changyuan Lyu Signed-off-by: Changyuan Lyu Cc: Alexander Graf Cc: Andy Lutomirski Cc: Anthony Yznaga Cc: Arnd Bergmann Cc: Ashish Kalra Cc: Ben Herrenschmidt Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David Woodhouse Cc: Eric Biederman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Gowans Cc: Jonathan Corbet Cc: Krzysztof Kozlowski Cc: Marc Rutland Cc: Paolo Bonzini Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Pratyush Yadav Cc: Rob Herring Cc: Saravana Kannan Cc: Stanislav Kinsburskii Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Thomas Lendacky Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 02dcfc8c427e..348844cffb13 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -16,13 +16,34 @@ enum kho_event { KEXEC_KHO_ABORT = 1, }; +struct folio; struct notifier_block; +#define DECLARE_KHOSER_PTR(name, type) \ + union { \ + phys_addr_t phys; \ + type ptr; \ + } name +#define KHOSER_STORE_PTR(dest, val) \ + ({ \ + typeof(val) v = val; \ + typecheck(typeof((dest).ptr), v); \ + (dest).phys = virt_to_phys(v); \ + }) +#define KHOSER_LOAD_PTR(src) \ + ({ \ + typeof(src) s = src; \ + (typeof((s).ptr))((s).phys ? phys_to_virt((s).phys) : NULL); \ + }) + struct kho_serialization; #ifdef CONFIG_KEXEC_HANDOVER bool kho_is_enabled(void); +int kho_preserve_folio(struct folio *folio); +int kho_preserve_phys(phys_addr_t phys, size_t size); +struct folio *kho_restore_folio(phys_addr_t phys); int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt); int kho_retrieve_subtree(const char *name, phys_addr_t *phys); @@ -39,6 +60,21 @@ static inline bool kho_is_enabled(void) return false; } +static inline int kho_preserve_folio(struct folio *folio) +{ + return -EOPNOTSUPP; +} + +static inline int kho_preserve_phys(phys_addr_t phys, size_t size) +{ + return -EOPNOTSUPP; +} + +static inline struct folio *kho_restore_folio(phys_addr_t phys) +{ + return NULL; +} + static inline int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt) { -- cgit v1.2.3 From 3bdecc3c93f9f68d11ed54971dde169b6ead9d78 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 9 May 2025 00:46:25 -0700 Subject: kexec: add KHO support to kexec file loads Kexec has 2 modes: A user space driven mode and a kernel driven mode. For the kernel driven mode, kernel code determines the physical addresses of all target buffers that the payload gets copied into. With KHO, we can only safely copy payloads into the "scratch area". Teach the kexec file loader about it, so it only allocates for that area. In addition, enlighten it with support to ask the KHO subsystem for its respective payloads to copy into target memory. Also teach the KHO subsystem how to fill the images for file loads. Link: https://lkml.kernel.org/r/20250509074635.3187114-8-changyuanl@google.com Signed-off-by: Alexander Graf Co-developed-by: Mike Rapoport (Microsoft) Signed-off-by: Mike Rapoport (Microsoft) Co-developed-by: Changyuan Lyu Signed-off-by: Changyuan Lyu Cc: Andy Lutomirski Cc: Anthony Yznaga Cc: Arnd Bergmann Cc: Ashish Kalra Cc: Ben Herrenschmidt Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David Woodhouse Cc: Eric Biederman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Gowans Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Krzysztof Kozlowski Cc: Marc Rutland Cc: Paolo Bonzini Cc: Pasha Tatashin Cc: Peter Zijlstra Cc: Pratyush Yadav Cc: Rob Herring Cc: Saravana Kannan Cc: Stanislav Kinsburskii Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Thomas Lendacky Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/kexec.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index c8971861521a..075255de8154 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -371,6 +371,11 @@ struct kimage { size_t ima_buffer_size; #endif + struct { + struct kexec_segment *scratch; + phys_addr_t fdt; + } kho; + /* Core ELF header buffer */ void *elf_headers; unsigned long elf_headers_sz; -- cgit v1.2.3 From f88ce2c84a341f44a7d00bc10868714bc4751f7e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 23 Apr 2025 14:33:37 +0100 Subject: mm: introduce for_each_valid_pfn() and use it from reserve_bootmem_region() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: Introduce for_each_valid_pfn()", v4. There are cases where a naïve loop over a PFN range, calling pfn_valid() on each one, is horribly inefficient. Ruihan Li reported the case where memmap_init() iterates all the way from zero to a potentially large value of ARCH_PFN_OFFSET, and we at Amazon found the reserve_bootmem_region() one as it affects hypervisor live update. Others are more cosmetic. By introducing a for_each_valid_pfn() helper it can optimise away a lot of pointless calls to pfn_valid(), skipping immediately to the next valid PFN and also skipping *all* checks within a valid (sub)region according to the granularity of the memory model in use. This patch (of 7) Especially since commit 9092d4f7a1f8 ("memblock: update initialization of reserved pages"), the reserve_bootmem_region() function can spend a significant amount of time iterating over every 4KiB PFN in a range, calling pfn_valid() on each one, and ultimately doing absolutely nothing. On a platform used for virtualization, with large NOMAP regions that eventually get used for guest RAM, this leads to a significant increase in steal time experienced during kexec for a live update. Introduce for_each_valid_pfn() and use it from reserve_bootmem_region(). This implementation is precisely the same naïve loop that the functio used to have, but subsequent commits will provide optimised versions for FLATMEM and SPARSEMEM, and this version will remain for those architectures which provide their own pfn_valid() implementation, until/unless they also provide a matching for_each_valid_pfn(). Link: https://lkml.kernel.org/r/20250423133821.789413-1-dwmw2@infradead.org Link: https://lkml.kernel.org/r/20250423133821.789413-2-dwmw2@infradead.org Signed-off-by: David Woodhouse Reviewed-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Marc Rutland Cc: Marc Zyngier Cc: Ruihan Li Cc: Will Deacon Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6ccec1bf2896..230a29c2ed1a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -2177,6 +2177,16 @@ void sparse_init(void); #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ +/* + * Fallback case for when the architecture provides its own pfn_valid() but + * not a corresponding for_each_valid_pfn(). + */ +#ifndef for_each_valid_pfn +#define for_each_valid_pfn(_pfn, _start_pfn, _end_pfn) \ + for ((_pfn) = (_start_pfn); (_pfn) < (_end_pfn); (_pfn)++) \ + if (pfn_valid(_pfn)) +#endif + #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */ -- cgit v1.2.3 From 037926316c9ded1194927ee56b862e3a1450aaf3 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 23 Apr 2025 14:33:39 +0100 Subject: mm: implement for_each_valid_pfn() for CONFIG_SPARSEMEM Implement for_each_valid_pfn() based on two helper functions. The first_valid_pfn() function largely mirrors pfn_valid(), calling into a pfn_section_first_valid() helper which is trivial for the !VMEMMAP case, and in the VMEMMAP case will skip to the next subsection as needed. Since next_valid_pfn() knows that its argument *is* a valid PFN, it doesn't need to do any checking at all while iterating over the low bits within a (sub)section mask; the whole (sub)section is either present or not. Note that the VMEMMAP version of pfn_section_first_valid() may return a value *higher* than end_pfn when skipping to the next subsection, and first_valid_pfn() happily returns that higher value. This is fine. [dwmw2@infradead.org: fix next_valid_pfn() for sparsemem] Link: https://lkml.kernel.org/r/c15100fcf6781a60b852c4dbb43bdc98a678fcf0.camel@infradead.org Link: https://lkml.kernel.org/r/20250423133821.789413-4-dwmw2@infradead.org Signed-off-by: David Woodhouse Reviewed-by: Mike Rapoport (Microsoft) Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: David Hildenbrand Cc: Marc Rutland Cc: Marc Zyngier Cc: Ruihan Li Cc: Will Deacon Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 230a29c2ed1a..b19a98c20de8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -2075,11 +2075,37 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) return usage ? test_bit(idx, usage->subsection_map) : 0; } + +static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn) +{ + struct mem_section_usage *usage = READ_ONCE(ms->usage); + int idx = subsection_map_index(*pfn); + unsigned long bit; + + if (!usage) + return false; + + if (test_bit(idx, usage->subsection_map)) + return true; + + /* Find the next subsection that exists */ + bit = find_next_bit(usage->subsection_map, SUBSECTIONS_PER_SECTION, idx); + if (bit == SUBSECTIONS_PER_SECTION) + return false; + + *pfn = (*pfn & PAGE_SECTION_MASK) + (bit * PAGES_PER_SUBSECTION); + return true; +} #else static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { return 1; } + +static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn) +{ + return true; +} #endif void sparse_init_early_section(int nid, struct page *map, unsigned long pnum, @@ -2128,6 +2154,58 @@ static inline int pfn_valid(unsigned long pfn) return ret; } + +/* Returns end_pfn or higher if no valid PFN remaining in range */ +static inline unsigned long first_valid_pfn(unsigned long pfn, unsigned long end_pfn) +{ + unsigned long nr = pfn_to_section_nr(pfn); + + rcu_read_lock_sched(); + + while (nr <= __highest_present_section_nr && pfn < end_pfn) { + struct mem_section *ms = __pfn_to_section(pfn); + + if (valid_section(ms) && + (early_section(ms) || pfn_section_first_valid(ms, &pfn))) { + rcu_read_unlock_sched(); + return pfn; + } + + /* Nothing left in this section? Skip to next section */ + nr++; + pfn = section_nr_to_pfn(nr); + } + + rcu_read_unlock_sched(); + return end_pfn; +} + +static inline unsigned long next_valid_pfn(unsigned long pfn, unsigned long end_pfn) +{ + pfn++; + + if (pfn >= end_pfn) + return end_pfn; + + /* + * Either every PFN within the section (or subsection for VMEMMAP) is + * valid, or none of them are. So there's no point repeating the check + * for every PFN; only call first_valid_pfn() again when crossing a + * (sub)section boundary (i.e. !(pfn & ~PAGE_{SUB,}SECTION_MASK)). + */ + if (pfn & ~(IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP) ? + PAGE_SUBSECTION_MASK : PAGE_SECTION_MASK)) + return pfn; + + return first_valid_pfn(pfn, end_pfn); +} + + +#define for_each_valid_pfn(_pfn, _start_pfn, _end_pfn) \ + for ((_pfn) = first_valid_pfn((_start_pfn), (_end_pfn)); \ + (_pfn) < (_end_pfn); \ + (_pfn) = next_valid_pfn((_pfn), (_end_pfn))) + #endif static inline int pfn_in_present_section(unsigned long pfn) -- cgit v1.2.3 From 4428a35f91f0f0e31d874038b3091e1c5a461f34 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Thu, 24 Apr 2025 23:56:06 +0800 Subject: mm/rmap: inline folio_test_large_maybe_mapped_shared() into callers To prevent the function from being used when CONFIG_MM_ID is disabled, we intend to inline it into its few callers, which also would help maintain the expected code placement. Link: https://lkml.kernel.org/r/20250424155606.57488-1-lance.yang@linux.dev Signed-off-by: Lance Yang Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: Mingzhe Yang Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- include/linux/page-flags.h | 4 ---- include/linux/rmap.h | 2 +- 3 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9b701cfbef22..21dd110b6655 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2111,7 +2111,7 @@ static inline bool folio_maybe_mapped_shared(struct folio *folio) */ if (mapcount <= 1) return false; - return folio_test_large_maybe_mapped_shared(folio); + return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids); } #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d3909cb1e576..37b11f15dbd9 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1230,10 +1230,6 @@ static inline int folio_has_private(const struct folio *folio) return !!(folio->flags & PAGE_FLAGS_PRIVATE); } -static inline bool folio_test_large_maybe_mapped_shared(const struct folio *folio) -{ - return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids); -} #undef PF_ANY #undef PF_HEAD #undef PF_NO_TAIL diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6b82b618846e..c4f4903b1088 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -223,7 +223,7 @@ static inline void __folio_large_mapcount_sanity_checks(const struct folio *foli VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY && folio->_mm_id_mapcount[1] < 0); VM_WARN_ON_ONCE(!folio_mapped(folio) && - folio_test_large_maybe_mapped_shared(folio)); + test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)); } static __always_inline void folio_set_large_mapcount(struct folio *folio, -- cgit v1.2.3 From 60309008e1e2b2d4bff3c2475b0c74faf395f787 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 28 Apr 2025 10:27:54 +0300 Subject: util_macros.h: make the header more resilient Add missing header inclusions. Link: https://lkml.kernel.org/r/20250428072754.3265274-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton --- include/linux/util_macros.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h index 3b570b765b75..76ca2b83c13e 100644 --- a/include/linux/util_macros.h +++ b/include/linux/util_macros.h @@ -2,7 +2,10 @@ #ifndef _LINUX_HELPER_MACROS_H_ #define _LINUX_HELPER_MACROS_H_ +#include #include +#include +#include /** * for_each_if - helper for handling conditionals in various for_each macros -- cgit v1.2.3 From 86ebd50224c0734d965843260d0dc057a9431c61 Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Wed, 30 Apr 2025 10:01:51 +0000 Subject: mm: add folio_expected_ref_count() for reference count calculation Patch series " JFS: Implement migrate_folio for jfs_metapage_aops" v5. This patchset addresses a warning that occurs during memory compaction due to JFS's missing migrate_folio operation. The warning was introduced by commit 7ee3647243e5 ("migrate: Remove call to ->writepage") which added explicit warnings when filesystem don't implement migrate_folio. The syzbot reported following [1]: jfs_metapage_aops does not implement migrate_folio WARNING: CPU: 1 PID: 5861 at mm/migrate.c:955 fallback_migrate_folio mm/migrate.c:953 [inline] WARNING: CPU: 1 PID: 5861 at mm/migrate.c:955 move_to_new_folio+0x70e/0x840 mm/migrate.c:1007 Modules linked in: CPU: 1 UID: 0 PID: 5861 Comm: syz-executor280 Not tainted 6.15.0-rc1-next-20250411-syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2025 RIP: 0010:fallback_migrate_folio mm/migrate.c:953 [inline] RIP: 0010:move_to_new_folio+0x70e/0x840 mm/migrate.c:1007 To fix this issue, this series implement metapage_migrate_folio() for JFS which handles both single and multiple metapages per page configurations. While most filesystems leverage existing migration implementations like filemap_migrate_folio(), buffer_migrate_folio_norefs() or buffer_migrate_folio() (which internally used folio_expected_refs()), JFS's metapage architecture requires special handling of its private data during migration. To support this, this series introduce the folio_expected_ref_count(), which calculates external references to a folio from page/swap cache, private data, and page table mappings. This standardized implementation replaces the previous ad-hoc folio_expected_refs() function and enables JFS to accurately determine whether a folio has unexpected references before attempting migration. Implement folio_expected_ref_count() to calculate expected folio reference counts from: - Page/swap cache (1 per page) - Private data (1) - Page table mappings (1 per map) While originally needed for page migration operations, this improved implementation standardizes reference counting by consolidating all refcount contributors into a single, reusable function that can benefit any subsystem needing to detect unexpected references to folios. The folio_expected_ref_count() returns the sum of these external references without including any reference the caller itself might hold. Callers comparing against the actual folio_ref_count() must account for their own references separately. Link: https://syzkaller.appspot.com/bug?extid=8bb6fd945af4e0ad9299 [1] Link: https://lkml.kernel.org/r/20250430100150.279751-1-shivankg@amd.com Link: https://lkml.kernel.org/r/20250430100150.279751-2-shivankg@amd.com Signed-off-by: David Hildenbrand Signed-off-by: Shivank Garg Suggested-by: Matthew Wilcox Co-developed-by: David Hildenbrand Cc: Alistair Popple Cc: Dave Kleikamp Cc: Donet Tom Cc: Jane Chu Cc: Kefeng Wang Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 21dd110b6655..1d1953e37baa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2114,6 +2114,61 @@ static inline bool folio_maybe_mapped_shared(struct folio *folio) return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids); } +/** + * folio_expected_ref_count - calculate the expected folio refcount + * @folio: the folio + * + * Calculate the expected folio refcount, taking references from the pagecache, + * swapcache, PG_private and page table mappings into account. Useful in + * combination with folio_ref_count() to detect unexpected references (e.g., + * GUP or other temporary references). + * + * Does currently not consider references from the LRU cache. If the folio + * was isolated from the LRU (which is the case during migration or split), + * the LRU cache does not apply. + * + * Calling this function on an unmapped folio -- !folio_mapped() -- that is + * locked will return a stable result. + * + * Calling this function on a mapped folio will not result in a stable result, + * because nothing stops additional page table mappings from coming (e.g., + * fork()) or going (e.g., munmap()). + * + * Calling this function without the folio lock will also not result in a + * stable result: for example, the folio might get dropped from the swapcache + * concurrently. + * + * However, even when called without the folio lock or on a mapped folio, + * this function can be used to detect unexpected references early (for example, + * if it makes sense to even lock the folio and unmap it). + * + * The caller must add any reference (e.g., from folio_try_get()) it might be + * holding itself to the result. + * + * Returns the expected folio refcount. + */ +static inline int folio_expected_ref_count(const struct folio *folio) +{ + const int order = folio_order(folio); + int ref_count = 0; + + if (WARN_ON_ONCE(folio_test_slab(folio))) + return 0; + + if (folio_test_anon(folio)) { + /* One reference per page from the swapcache. */ + ref_count += folio_test_swapcache(folio) << order; + } else if (!((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS)) { + /* One reference per page from the pagecache. */ + ref_count += !!folio->mapping << order; + /* One reference from PG_private. */ + ref_count += folio_test_private(folio); + } + + /* One reference per page table mapping. */ + return ref_count + folio_mapcount(folio); +} + #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE static inline int arch_make_folio_accessible(struct folio *folio) { -- cgit v1.2.3 From 6c36ac1e124f1be97cf0485a220865fce5a2020d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 28 Apr 2025 16:28:14 +0100 Subject: mm: establish mm/vma_exec.c for shared exec/mm VMA functionality Patch series "move all VMA allocation, freeing and duplication logic to mm", v3. Currently VMA allocation, freeing and duplication exist in kernel/fork.c, which is a violation of separation of concerns, and leaves these functions exposed to the rest of the kernel when they are in fact internal implementation details. Resolve this by moving this logic to mm, and making it internal to vma.c, vma.h. This also allows us, in future, to provide userland testing around this functionality. We additionally abstract dup_mmap() to mm, being careful to ensure kernel/fork.c acceses this via the mm internal header so it is not exposed elsewhere in the kernel. As part of this change, also abstract initial stack allocation performed in __bprm_mm_init() out of fs code into mm via the create_init_stack_vma(), as this code uses vm_area_alloc() and vm_area_free(). In order to do so sensibly, we introduce a new mm/vma_exec.c file, which contains the code that is shared by mm and exec. This file is added to both memory mapping and exec sections in MAINTAINERS so both sets of maintainers can maintain oversight. As part of this change, we also move relocate_vma_down() to mm/vma_exec.c so all shared mm/exec functionality is kept in one place. We add code shared between nommu and mmu-enabled configurations in order to share VMA allocation, freeing and duplication code correctly while also keeping these functions available in userland VMA testing. This is achieved by adding a mm/vma_init.c file which is also compiled by the userland tests. This patch (of 4): There is functionality that overlaps the exec and memory mapping subsystems. While it properly belongs in mm, it is important that exec maintainers maintain oversight of this functionality correctly. We can establish both goals by adding a new mm/vma_exec.c file which contains these 'glue' functions, and have fs/exec.c import them. As a part of this change, to ensure that proper oversight is achieved, add the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections. scripts/get_maintainer.pl can correctly handle files in multiple entries and this neatly handles the cross-over. [akpm@linux-foundation.org: fix comment typo] Link: https://lkml.kernel.org/r/80f0d0c6-0b68-47f9-ab78-0ab7f74677fc@lucifer.local Link: https://lkml.kernel.org/r/cover.1745853549.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/91f2cee8f17d65214a9d83abb7011aa15f1ea690.1745853549.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Suren Baghdasaryan Reviewed-by: Pedro Falcato Reviewed-by: David Hildenbrand Reviewed-by: Kees Cook Reviewed-by: Vlastimil Babka Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jann Horn Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1d1953e37baa..43748c8f3454 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3278,7 +3278,6 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); -int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, bool write); -- cgit v1.2.3 From 69eadd6a05409ca3725cabf8d60ccf6c8f87e193 Mon Sep 17 00:00:00 2001 From: Guilherme Giacomo Simoes Date: Mon, 28 Apr 2025 17:14:09 -0300 Subject: mm: page-flags-layout.h: change the KASAN_TAG_WIDTH for HW_TAGS KASAN_TAG_WIDTH is 8 bits for both (HW_TAGS and SW_TAGS), but for HW_TAGS the KASAN_TAG_WIDTH can be 4 bits bits because due to the design of the MTE the memory words for storing metadata only need 4 bits. Change the preprocessor define KASAN_TAG_WIDTH for check if SW_TAGS is define, so KASAN_TAG_WIDTH should be 8 bits, but if HW_TAGS is define, so KASAN_TAG_WIDTH should be 4 bits to save a few flags bits. Link: https://lkml.kernel.org/r/20250428201409.5482-1-trintaeoitogc@gmail.com Signed-off-by: Guilherme Giacomo Simoes Suggested-by: Andrey Konovalov Reviewed-by: Andrey Konovalov Cc: Pasha Tatashin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/page-flags-layout.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 4f5c9e979bb9..760006b1c480 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -72,8 +72,10 @@ #define NODE_NOT_IN_PAGE_FLAGS 1 #endif -#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) +#if defined(CONFIG_KASAN_SW_TAGS) #define KASAN_TAG_WIDTH 8 +#elif defined(CONFIG_KASAN_HW_TAGS) +#define KASAN_TAG_WIDTH 4 #else #define KASAN_TAG_WIDTH 0 #endif -- cgit v1.2.3 From 2b80f633c360ce3c56a7071782eae70852c8f344 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Thu, 1 May 2025 02:10:50 +0800 Subject: filemap: do not use folio_contains for swap cache folios Currently, none of the folio_contains callers should encounter swap cache folios. For fs/ callers, swap cache folios are never part of their workflow. For filemap and truncate, folio_contains is only used for sanity checks to verify the folio index matches the expected lookup / invalidation target. The swap cache does not utilize filemap or truncate helpers in ways that would trigger these checks, as it mostly implements its own cache management. Shmem won't trigger these sanity checks either unless thing went wrong, as it would directly trigger a BUG because swap cache index are unrelated and almost never matches shmem index. Shmem have to handle mixed values of folios, shadows, and swap entries, so it has its own way of handling the mapping. While some filemap helpers works for swap cache space, the swap cache is different from the page cache in many ways. So this particular helper will unlikely to work in a helpful way for swap cache folios. So make it explicit here that folio_contains should not be used for swap cache folios. This helps to avoid misuse, make swap cache less exposed and remove the folio_index usage here. [akpm@linux-foundation.org: s/VM_WARN_ON_FOLIO/VM_WARN_ON_ONCE_FOLIO/, per Kairui] Link: https://lkml.kernel.org/r/20250430181052.55698-5-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: David Hildenbrand Cc: Chao Yu Cc: Chris Li Cc: Chris Mason Cc: Christian Brauner Cc: David Sterba Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jaegeuk Kim Cc: Joanne Koong Cc: Johannes Weiner Cc: Josef Bacik Cc: Matthew Wilcox (Oracle) Cc: Miklos Szeredi Cc: Nhat Pham Cc: Qu Wenruo Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index af25fb640463..b2c2ff8de046 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -935,14 +935,14 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) * @folio: The folio. * @index: The page index within the file. * - * Context: The caller should have the page locked in order to prevent - * (eg) shmem from moving the page between the page cache and swap cache - * and changing its index in the middle of the operation. + * Context: The caller should have the folio locked and ensure + * e.g., shmem did not move this folio to the swap cache. * Return: true or false. */ static inline bool folio_contains(struct folio *folio, pgoff_t index) { - return index - folio_index(folio) < folio_nr_pages(folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); + return index - folio->index < folio_nr_pages(folio); } unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, -- cgit v1.2.3 From 7d0f0f06153116bb737eef76d47406639e161613 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Thu, 1 May 2025 02:10:51 +0800 Subject: mm: move folio_index to mm/swap.h and remove no longer needed helper There are no remaining users of folio_index() outside the mm subsystem. Move it to mm/swap.h to co-locate it with swap_cache_index(), eliminating a forward declaration, and a function call overhead. Also remove the helper that was used to fix circular header dependency issue. Link: https://lkml.kernel.org/r/20250430181052.55698-6-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: David Hildenbrand Cc: Chao Yu Cc: Chris Li Cc: Chris Mason Cc: Christian Brauner Cc: David Sterba Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jaegeuk Kim Cc: Joanne Koong Cc: Johannes Weiner Cc: Josef Bacik Cc: Matthew Wilcox (Oracle) Cc: Miklos Szeredi Cc: Nhat Pham Cc: Qu Wenruo Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b2c2ff8de046..5d66786867eb 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -884,26 +884,6 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } -extern pgoff_t __folio_swap_cache_index(struct folio *folio); - -/** - * folio_index - File index of a folio. - * @folio: The folio. - * - * For a folio which is either in the page cache or the swap cache, - * return its index within the address_space it belongs to. If you know - * the page is definitely in the page cache, you can look at the folio's - * index directly. - * - * Return: The index (offset in units of pages) of a folio in its file. - */ -static inline pgoff_t folio_index(struct folio *folio) -{ - if (unlikely(folio_test_swapcache(folio))) - return __folio_swap_cache_index(folio); - return folio->index; -} - /** * folio_next_index - Get the index of the next folio. * @folio: The current folio. -- cgit v1.2.3 From dd309bfc68efc4522b4d33a95e92afff249e103b Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Thu, 1 May 2025 02:10:52 +0800 Subject: mm, swap: remove no longer used swap mapping helper This helper existed to fix the circular header dependency issue but it is no longer used since commit 0d40cfe63a2f ("fs: remove folio_file_mapping()"), remove it. Link: https://lkml.kernel.org/r/20250430181052.55698-7-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Chao Yu Cc: Chris Li Cc: Chris Mason Cc: Christian Brauner Cc: David Sterba Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jaegeuk Kim Cc: Joanne Koong Cc: Johannes Weiner Cc: Josef Bacik Cc: Miklos Szeredi Cc: Nhat Pham Cc: Qu Wenruo Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 5d66786867eb..d2ced9920992 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -533,7 +533,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) } struct address_space *folio_mapping(struct folio *); -struct address_space *swapcache_mapping(struct folio *); /** * folio_flush_mapping - Find the file mapping this folio belongs to. -- cgit v1.2.3 From 0aa7b390fc40a871267a2328bbbefca8b37ad307 Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Thu, 24 Apr 2025 16:25:25 +0300 Subject: mtd: core: always create master device Create master device without partition when CONFIG_MTD_PARTITIONED_MASTER flag is unset. This streamlines device tree and allows to anchor runtime power management on master device in all cases. Signed-off-by: Alexander Usyskin Signed-off-by: Miquel Raynal --- include/linux/mtd/partitions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index b74a539ec581..5daf80df9e89 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -108,7 +108,7 @@ extern void deregister_mtd_parser(struct mtd_part_parser *parser); deregister_mtd_parser) int mtd_add_partition(struct mtd_info *master, const char *name, - long long offset, long long length); + long long offset, long long length, struct mtd_info **part); int mtd_del_partition(struct mtd_info *master, int partno); uint64_t mtd_get_device_size(const struct mtd_info *mtd); -- cgit v1.2.3 From 598995027b9181ada81789bf01fb8ef30d93c9dc Mon Sep 17 00:00:00 2001 From: Peter Griffin Date: Tue, 6 May 2025 21:57:31 +0100 Subject: soc: samsung: exynos-pmu: enable CPU hotplug support for gs101 Some additional register writes are required when hotplugging CPUs on gs101, without these the system hangs when hotplugging. Specifically a CPU_INFORM register needs to be programmed with a hint value which is used by the EL3 firmware (el3mon) and the pmu-intr-gen registers need to be programmed. With this patch applied, and corresponding DT update CPU hotplug now works as expected. e.g. echo 0 > /sys/devices/system/cpu/cpu6/online echo 1 > /sys/devices/system/cpu/cpu6/online Note: to maintain compatibility with older DTs that didn't specify pmu-intr-gen phandle only a warning is issued if the syscon can't be obtained. Signed-off-by: Peter Griffin Link: https://lore.kernel.org/r/20250506-contrib-pg-cpu-hotplug-suspend2ram-fixes-v1-v4-5-9f64a2657316@linaro.org [krzk: few blank line and white-space alignment fixes from checkpatch] Signed-off-by: Krzysztof Kozlowski --- include/linux/soc/samsung/exynos-regs-pmu.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h index ce1a3790d6fb..0d5a17ea8fb8 100644 --- a/include/linux/soc/samsung/exynos-regs-pmu.h +++ b/include/linux/soc/samsung/exynos-regs-pmu.h @@ -658,9 +658,20 @@ #define EXYNOS5433_PAD_RETENTION_FSYSGENIO_OPTION (0x32A8) /* For Tensor GS101 */ +/* PMU ALIVE */ #define GS101_SYSIP_DAT0 (0x810) +#define GS101_CPU0_INFORM (0x860) +#define GS101_CPU_INFORM(cpu) \ + (GS101_CPU0_INFORM + (cpu*4)) #define GS101_SYSTEM_CONFIGURATION (0x3A00) #define GS101_PHY_CTRL_USB20 (0x3EB0) #define GS101_PHY_CTRL_USBDP (0x3EB4) +/* PMU INTR GEN */ +#define GS101_GRP1_INTR_BID_UPEND (0x0108) +#define GS101_GRP1_INTR_BID_CLEAR (0x010c) +#define GS101_GRP2_INTR_BID_ENABLE (0x0200) +#define GS101_GRP2_INTR_BID_UPEND (0x0208) +#define GS101_GRP2_INTR_BID_CLEAR (0x020c) + #endif /* __LINUX_SOC_EXYNOS_REGS_PMU_H */ -- cgit v1.2.3 From 8c619eb21b8e87ae95877e9cca9fcb0e3115776e Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Sun, 4 May 2025 10:14:34 +0200 Subject: net: phy: micrel: remove KSZ9477 EEE quirks now handled by phylink The KSZ9477 PHY driver contained workarounds for broken EEE capability advertisements by manually masking supported EEE modes and forcibly disabling EEE if MICREL_NO_EEE was set. With proper MAC-side EEE handling implemented via phylink, these quirks are no longer necessary. Remove MICREL_NO_EEE handling and the use of ksz9477_get_features(). This simplifies the PHY driver and avoids duplicated EEE management logic. Signed-off-by: Oleksij Rempel Cc: stable@vger.kernel.org # v6.14+ Link: https://patch.msgid.link/20250504081434.424489-3-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- include/linux/micrel_phy.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h index 591bf5b5e8dc..9af01bdd86d2 100644 --- a/include/linux/micrel_phy.h +++ b/include/linux/micrel_phy.h @@ -44,7 +44,6 @@ #define MICREL_PHY_50MHZ_CLK BIT(0) #define MICREL_PHY_FXEN BIT(1) #define MICREL_KSZ8_P1_ERRATA BIT(2) -#define MICREL_NO_EEE BIT(3) #define MICREL_KSZ9021_EXTREG_CTRL 0xB #define MICREL_KSZ9021_EXTREG_DATA_WRITE 0xC -- cgit v1.2.3 From e9f3d61db5cb29b3f17f0dc40c3ec2cda2ee93e5 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 8 May 2025 00:48:22 +0000 Subject: net: add get_netmem/put_netmem support Currently net_iovs support only pp ref counts, and do not support a page ref equivalent. This is fine for the RX path as net_iovs are used exclusively with the pp and only pp refcounting is needed there. The TX path however does not use pp ref counts, thus, support for get_page/put_page equivalent is needed for netmem. Support get_netmem/put_netmem. Check the type of the netmem before passing it to page or net_iov specific code to obtain a page ref equivalent. For dmabuf net_iovs, we obtain a ref on the underlying binding. This ensures the entire binding doesn't disappear until all the net_iovs have been put_netmem'ed. We do not need to track the refcount of individual dmabuf net_iovs as we don't allocate/free them from a pool similar to what the buddy allocator does for pages. This code is written to be extensible by other net_iov implementers. get_netmem/put_netmem will check the type of the netmem and route it to the correct helper: pages -> [get|put]_page() dmabuf net_iovs -> net_devmem_[get|put]_net_iov() new net_iovs -> new helpers Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250508004830.4100853-3-almasrymina@google.com Signed-off-by: Paolo Abeni --- include/linux/skbuff_ref.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h index 0f3c58007488..9e49372ef1a0 100644 --- a/include/linux/skbuff_ref.h +++ b/include/linux/skbuff_ref.h @@ -17,7 +17,7 @@ */ static inline void __skb_frag_ref(skb_frag_t *frag) { - get_page(skb_frag_page(frag)); + get_netmem(skb_frag_netmem(frag)); } /** @@ -40,7 +40,7 @@ static inline void skb_page_unref(netmem_ref netmem, bool recycle) if (recycle && napi_pp_put_page(netmem)) return; #endif - put_page(netmem_to_page(netmem)); + put_netmem(netmem); } /** -- cgit v1.2.3 From bd61848900bff597764238f3a8ec67c815cd316e Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 8 May 2025 00:48:24 +0000 Subject: net: devmem: Implement TX path Augment dmabuf binding to be able to handle TX. Additional to all the RX binding, we also create tx_vec needed for the TX path. Provide API for sendmsg to be able to send dmabufs bound to this device: - Provide a new dmabuf_tx_cmsg which includes the dmabuf to send from. - MSG_ZEROCOPY with SCM_DEVMEM_DMABUF cmsg indicates send from dma-buf. Devmem is uncopyable, so piggyback off the existing MSG_ZEROCOPY implementation, while disabling instances where MSG_ZEROCOPY falls back to copying. We additionally pipe the binding down to the new zerocopy_fill_skb_from_devmem which fills a TX skb with net_iov netmems instead of the traditional page netmems. We also special case skb_frag_dma_map to return the dma-address of these dmabuf net_iovs instead of attempting to map pages. The TX path may release the dmabuf in a context where we cannot wait. This happens when the user unbinds a TX dmabuf while there are still references to its netmems in the TX path. In that case, the netmems will be put_netmem'd from a context where we can't unmap the dmabuf, Resolve this by making __net_devmem_dmabuf_binding_free schedule_work'd. Based on work by Stanislav Fomichev . A lot of the meat of the implementation came from devmem TCP RFC v1[1], which included the TX path, but Stan did all the rebasing on top of netmem/net_iov. Cc: Stanislav Fomichev Signed-off-by: Kaiyuan Zhang Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250508004830.4100853-5-almasrymina@google.com Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f3e72be6f634..c7397b17bb08 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1707,13 +1707,16 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops; struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg); + struct ubuf_info *uarg, bool devmem); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); +struct net_devmem_dmabuf_binding; + int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, - size_t length); + size_t length, + struct net_devmem_dmabuf_binding *binding); int zerocopy_fill_skb_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length); @@ -1721,12 +1724,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb, static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { - return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len); + return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len, + NULL); } int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, - struct ubuf_info *uarg); + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding); /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) @@ -3697,6 +3702,10 @@ static inline dma_addr_t __skb_frag_dma_map(struct device *dev, size_t offset, size_t size, enum dma_data_direction dir) { + if (skb_frag_is_net_iov(frag)) { + return netmem_to_net_iov(frag->netmem)->dma_addr + offset + + frag->offset; + } return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } -- cgit v1.2.3 From 383faec0fd64b9bff15eb5f700f023ec35520a96 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 8 May 2025 00:48:26 +0000 Subject: net: enable driver support for netmem TX Drivers need to make sure not to pass netmem dma-addrs to the dma-mapping API in order to support netmem TX. Add helpers and netmem_dma_*() helpers that enables special handling of netmem dma-addrs that drivers can use. Document in netmem.rst what drivers need to do to support netmem TX. Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250508004830.4100853-7-almasrymina@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 773167508c82..32a1e41636a9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1772,6 +1772,7 @@ enum netdev_reg_state { * @lltx: device supports lockless Tx. Deprecated for real HW * drivers. Mainly used by logical interfaces, such as * bonding and tunnels + * @netmem_tx: device support netmem_tx. * * @name: This is the first field of the "visible" part of this structure * (i.e. as seen by users in the "Space.c" file). It is the name @@ -2087,6 +2088,7 @@ struct net_device { struct_group(priv_flags_fast, unsigned long priv_flags:32; unsigned long lltx:1; + unsigned long netmem_tx:1; ); const struct net_device_ops *netdev_ops; const struct header_ops *header_ops; -- cgit v1.2.3 From 4f8ceb0302b36c5f78bcc8d0e7cfa2372fba134c Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Tue, 29 Apr 2025 14:51:28 +0200 Subject: mfd: stm32-lptimer: Add support for stm32mp25 Add support for STM32MP25 SoC. A new hardware configuration register (HWCFGR2) has been added, to gather number of capture/compare channels, autonomous mode and input capture capability. The full feature set is implemented in LPTIM1/2/3/4. LPTIM5 supports a smaller set of features. This can now be read from HWCFGR registers. Add new registers to the stm32-lptimer.h: CCMR1, CCR2, HWCFGR1/2 and VERR. Update the stm32_lptimer data struct so signal the number of capture/compare channels to the child devices. Also Remove some unused bit masks (CMPOK_ARROK / CMPOKCF_ARROKCF). Signed-off-by: Fabrice Gasnier Link: https://lore.kernel.org/r/20250429125133.1574167-3-fabrice.gasnier@foss.st.com Signed-off-by: Lee Jones --- include/linux/mfd/stm32-lptimer.h | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/stm32-lptimer.h b/include/linux/mfd/stm32-lptimer.h index 06d3f11dc3c9..a592c8dc716d 100644 --- a/include/linux/mfd/stm32-lptimer.h +++ b/include/linux/mfd/stm32-lptimer.h @@ -17,20 +17,30 @@ #define STM32_LPTIM_IER 0x08 /* Interrupt Enable Reg */ #define STM32_LPTIM_CFGR 0x0C /* Configuration Reg */ #define STM32_LPTIM_CR 0x10 /* Control Reg */ -#define STM32_LPTIM_CMP 0x14 /* Compare Reg */ +#define STM32_LPTIM_CMP 0x14 /* Compare Reg (MP25 CCR1) */ #define STM32_LPTIM_ARR 0x18 /* Autoreload Reg */ #define STM32_LPTIM_CNT 0x1C /* Counter Reg */ +#define STM32_LPTIM_CCMR1 0x2C /* Capture/Compare Mode MP25 */ +#define STM32_LPTIM_CCR2 0x34 /* Compare Reg2 MP25 */ + +#define STM32_LPTIM_HWCFGR2 0x3EC /* Hardware configuration register 2 - MP25 */ +#define STM32_LPTIM_HWCFGR1 0x3F0 /* Hardware configuration register 1 - MP15 */ +#define STM32_LPTIM_VERR 0x3F4 /* Version identification register - MP15 */ /* STM32_LPTIM_ISR - bit fields */ +#define STM32_LPTIM_DIEROK_ARROK (BIT(24) | BIT(4)) /* MP25 */ +#define STM32_LPTIM_CMP2_ARROK (BIT(19) | BIT(4)) #define STM32_LPTIM_CMPOK_ARROK GENMASK(4, 3) #define STM32_LPTIM_ARROK BIT(4) #define STM32_LPTIM_CMPOK BIT(3) /* STM32_LPTIM_ICR - bit fields */ -#define STM32_LPTIM_ARRMCF BIT(1) +#define STM32_LPTIM_DIEROKCF_ARROKCF (BIT(24) | BIT(4)) /* MP25 */ +#define STM32_LPTIM_CMP2OKCF_ARROKCF (BIT(19) | BIT(4)) #define STM32_LPTIM_CMPOKCF_ARROKCF GENMASK(4, 3) +#define STM32_LPTIM_ARRMCF BIT(1) -/* STM32_LPTIM_IER - bit flieds */ +/* STM32_LPTIM_IER - bit fields */ #define STM32_LPTIM_ARRMIE BIT(1) /* STM32_LPTIM_CR - bit fields */ @@ -53,16 +63,37 @@ /* STM32_LPTIM_ARR */ #define STM32_LPTIM_MAX_ARR 0xFFFF +/* STM32_LPTIM_CCMR1 */ +#define STM32_LPTIM_CC2P GENMASK(19, 18) +#define STM32_LPTIM_CC2E BIT(17) +#define STM32_LPTIM_CC2SEL BIT(16) +#define STM32_LPTIM_CC1P GENMASK(3, 2) +#define STM32_LPTIM_CC1E BIT(1) +#define STM32_LPTIM_CC1SEL BIT(0) + +/* STM32_LPTIM_HWCFGR1 */ +#define STM32_LPTIM_HWCFGR1_ENCODER BIT(16) + +/* STM32_LPTIM_HWCFGR2 */ +#define STM32_LPTIM_HWCFGR2_CHAN_NUM GENMASK(3, 0) + +/* STM32_LPTIM_VERR */ +#define STM32_LPTIM_VERR_23 0x23 /* STM32MP25 */ + /** * struct stm32_lptimer - STM32 Low-Power Timer data assigned by parent device * @clk: clock reference for this instance * @regmap: register map reference for this instance * @has_encoder: indicates this Low-Power Timer supports encoder mode + * @num_cc_chans: indicates the number of capture/compare channels + * @version: indicates the major and minor revision of the controller */ struct stm32_lptimer { struct clk *clk; struct regmap *regmap; bool has_encoder; + unsigned int num_cc_chans; + u32 version; }; #endif -- cgit v1.2.3 From 34a364ff04e960a4d47f558acf7fbafcc3085c1f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 9 May 2025 15:02:27 +0200 Subject: PM: sleep: Introduce pm_suspend_in_progress() Introduce pm_suspend_in_progress() to be used for checking if a system- wide suspend or resume transition is in progress, instead of comparing pm_suspend_target_state directly to PM_SUSPEND_ON, and use it where applicable. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Rodrigo Vivi Acked-by: Rodrigo Vivi Reviewed-by: Raag Jadav Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/2020901.PYKUYFuaPT@rjwysocki.net --- include/linux/suspend.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index da6ebca3ff77..52ea108f9451 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -298,6 +298,11 @@ static inline void s2idle_set_ops(const struct platform_s2idle_ops *ops) {} static inline void s2idle_wake(void) {} #endif /* !CONFIG_SUSPEND */ +static inline bool pm_suspend_in_progress(void) +{ + return pm_suspend_target_state != PM_SUSPEND_ON; +} + /* struct pbe is used for creating lists of pages that should be restored * atomically during the resume from disk, because the page frames they have * occupied before the suspend are in use. -- cgit v1.2.3 From 4a6b1cf0d4c02d6da2976c6314c264d20672937e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 6 May 2025 22:41:21 +0200 Subject: PM: EM: Introduce em_adjust_cpu_capacity() Add a function for updating the Energy Model for a CPU after its capacity has changed, which subsequently will be used by the intel_pstate driver. An EM_PERF_DOMAIN_ARTIFICIAL check is added to em_recalc_and_update() to prevent it from calling em_compute_costs() for an "artificial" perf domain with a NULL cb parameter which would cause it to crash. Signed-off-by: Rafael J. Wysocki Reviewed-by: Lukasz Luba Tested-by: Christian Loehle Reviewed-by: Dietmar Eggemann Link: https://patch.msgid.link/3637203.iIbC2pHGDl@rjwysocki.net --- include/linux/energy_model.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index d8eabbf86a5b..7fa1eb3cc823 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -179,6 +179,7 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, int em_dev_update_chip_binning(struct device *dev); int em_update_performance_limits(struct em_perf_domain *pd, unsigned long freq_min_khz, unsigned long freq_max_khz); +void em_adjust_cpu_capacity(unsigned int cpu); void em_rebuild_sched_domains(void); /** @@ -403,6 +404,7 @@ int em_update_performance_limits(struct em_perf_domain *pd, { return -EINVAL; } +static inline void em_adjust_cpu_capacity(unsigned int cpu) {} static inline void em_rebuild_sched_domains(void) {} #endif -- cgit v1.2.3 From a98e892c694284256296465a78d11072e2c5419f Mon Sep 17 00:00:00 2001 From: Werner Sembach Date: Tue, 11 Feb 2025 14:39:05 +0100 Subject: HID: core: Add functions for HID drivers to react on first open and last close call Adds a new function to the hid_driver struct that is called when the userspace starts using the device, and another one that is called when userspace stop using the device. With this a hid driver can implement special suspend handling for devices currently not in use. Signed-off-by: Werner Sembach Link: https://patch.msgid.link/20250211133950.422232-1-wse@tuxedocomputers.com Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index a1305210b2fd..568a9d8c749b 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -795,6 +795,8 @@ struct hid_usage_id { * @suspend: invoked on suspend (NULL means nop) * @resume: invoked on resume if device was not reset (NULL means nop) * @reset_resume: invoked on resume if device was reset (NULL means nop) + * @on_hid_hw_open: invoked when hid core opens first instance (NULL means nop) + * @on_hid_hw_close: invoked when hid core closes last instance (NULL means nop) * * probe should return -errno on error, or 0 on success. During probe, * input will not be passed to raw_event unless hid_device_io_start is @@ -850,6 +852,8 @@ struct hid_driver { int (*suspend)(struct hid_device *hdev, pm_message_t message); int (*resume)(struct hid_device *hdev); int (*reset_resume)(struct hid_device *hdev); + void (*on_hid_hw_open)(struct hid_device *hdev); + void (*on_hid_hw_close)(struct hid_device *hdev); /* private: */ struct device_driver driver; -- cgit v1.2.3 From 6c58d2791d6046727d87db50a5e46644f195dcf9 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 10 Apr 2025 17:24:16 +0800 Subject: tick/nohz: Remove unused tick_nohz_full_add_cpus_to() This function isn't used anywhere. Remove it. Signed-off-by: Alex Shi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250410092423.9831-1-alexs@kernel.org --- include/linux/tick.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index b8ddc8e631a3..ac76ae9fa36d 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -195,12 +195,6 @@ static inline bool tick_nohz_full_enabled(void) __ret; \ }) -static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) -{ - if (tick_nohz_full_enabled()) - cpumask_or(mask, mask, tick_nohz_full_mask); -} - extern void tick_nohz_dep_set(enum tick_dep_bits bit); extern void tick_nohz_dep_clear(enum tick_dep_bits bit); extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit); @@ -281,7 +275,6 @@ extern void __init tick_nohz_full_setup(cpumask_var_t cpumask); #else static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } -static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } static inline void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { } static inline void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { } -- cgit v1.2.3 From 895ee6a22e3195b7c1fee140c842bdeedb89ed33 Mon Sep 17 00:00:00 2001 From: "Yury Norov [NVIDIA]" Date: Fri, 9 May 2025 12:20:08 -0400 Subject: topology: make for_each_node_with_cpus() O(N) for_each_node_with_cpus() calls nr_cpus_node() at every iteration, which makes it O(N^2). Kernel tracks such nodes with N_CPU record in node_states array. Switching to it makes for_each_node_with_cpus() O(N). Andrea: Now we can include also offline nodes with CPUs assigned (assuming it's possible). If checking the online state is required, the user must use node_online() within the loop. CC: Andrea Righi CC:Tejun Heo Signed-off-by: Yury Norov [NVIDIA] --- include/linux/nodemask.h | 1 + include/linux/topology.h | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 0aa6b63aa747..f08ae71585fa 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -522,6 +522,7 @@ static __always_inline int node_random(const nodemask_t *maskp) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) +#define for_each_node_with_cpus(node) for_each_node_state(node, N_CPU) /* * For nodemask scratch area. diff --git a/include/linux/topology.h b/include/linux/topology.h index 24e715f0f6d2..ffee6b4a071a 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -39,10 +40,6 @@ #define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node)) #endif -#define for_each_node_with_cpus(node) \ - for_each_online_node(node) \ - if (nr_cpus_node(node)) - int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ -- cgit v1.2.3 From c4da7bf54b1f76e7c5c8cc6d1c4db8b19af67c5d Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Tue, 6 May 2025 10:09:31 +0800 Subject: blk-throttle: Introduce flag "BIO_TG_BPS_THROTTLED" Subsequent patches will split the single queue into separate bps and iops queues. To prevent IO that has already passed through the bps queue at a single tg level from being counted toward bps wait time again, we introduce "BIO_TG_BPS_THROTTLED" flag. Since throttle and QoS operate at different levels, we reuse the value as "BIO_QOS_THROTTLED". We set this flag when charge bps and clear it when charge iops, as the bio will move to the upper-level tg or be dispatched. This patch does not involve functional changes. Signed-off-by: Zizhi Wo Reviewed-by: Yu Kuai Signed-off-by: Zizhi Wo Link: https://lore.kernel.org/r/20250506020935.655574-5-wozizhi@huaweicloud.com Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f38425338c3f..3d1577f07c1c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -296,6 +296,14 @@ enum { * of this bio. */ BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */ + /* + * This bio has completed bps throttling at the single tg granularity, + * which is different from BIO_BPS_THROTTLED. When the bio is enqueued + * into the sq->queued of the upper tg, or is about to be dispatched, + * this flag needs to be cleared. Since blk-throttle and rq_qos are not + * on the same hierarchical level, reuse the value. + */ + BIO_TG_BPS_THROTTLED = BIO_QOS_THROTTLED, BIO_QOS_MERGED, /* but went through rq_qos merge path */ BIO_REMAPPED, BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */ -- cgit v1.2.3 From f5c0ecf196aaf78777f1606f1e0392c5e57c4530 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 9 May 2025 15:03:35 +0200 Subject: PM: sleep: Introduce pm_sleep_transition_in_progress() The "suspend in progress" check in device_wakeup_enable() does not cover hibernation, but arguably it should do that, so introduce pm_sleep_transition_in_progress() covering transitions during both system suspend and hibernation to use in there and use it also in pm_debug_messages_should_print(). Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/7820474.EvYhyI6sBW@rjwysocki.net [ rjw: Move the new function definition under CONFIG_PM_SLEEP ] Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 52ea108f9451..b1c76c8f2c82 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -475,6 +475,8 @@ extern void pm_print_active_wakeup_sources(void); extern unsigned int lock_system_sleep(void); extern void unlock_system_sleep(unsigned int); +extern bool pm_sleep_transition_in_progress(void); + #else /* !CONFIG_PM_SLEEP */ static inline int register_pm_notifier(struct notifier_block *nb) @@ -503,6 +505,8 @@ static inline void pm_system_irq_wakeup(unsigned int irq_number) {} static inline unsigned int lock_system_sleep(void) { return 0; } static inline void unlock_system_sleep(unsigned int flags) {} +static inline bool pm_sleep_transition_in_progress(void) { return false; } + #endif /* !CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_SLEEP_DEBUG -- cgit v1.2.3 From c84bf6dd2b836b49bb2662668ff1692350d28236 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 9 May 2025 13:13:34 +0100 Subject: mm: introduce new .mmap_prepare() file callback Patch series "eliminate mmap() retry merge, add .mmap_prepare hook", v2. During the mmap() of a file-backed mapping, we invoke the underlying driver file's mmap() callback in order to perform driver/file system initialisation of the underlying VMA. This has been a source of issues in the past, including a significant security concern relating to unwinding of error state discovered by Jann Horn, as fixed in commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") which performed the recent, significant, rework of mmap() as a whole. However, we have had a fly in the ointment remain - drivers have a great deal of freedom in the .mmap() hook to manipulate VMA state (as well as page table state). This can be problematic, as we can no longer reason sensibly about VMA state once the call is complete (the ability to do - anything - here does rather interfere with that). In addition, callers may choose to do odd or unusual things which might interfere with subsequent steps in the mmap() process, and it may do so and then raise an error, requiring very careful unwinding of state about which we can make no assumptions. Rather than providing such an open-ended interface, this series provides an alternative, far more restrictive one - we expose a whitelist of fields which can be adjusted by the driver, along with immutable state upon which the driver can make such decisions: struct vm_area_desc { /* Immutable state. */ struct mm_struct *mm; unsigned long start; unsigned long end; /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; struct file *file; vm_flags_t vm_flags; pgprot_t page_prot; /* Write-only fields. */ const struct vm_operations_struct *vm_ops; void *private_data; }; The mmap logic then updates the state used to either merge with a VMA or establish a new VMA based upon this logic. This is achieved via new file hook .mmap_prepare(), which is, importantly, invoked very early on in the mmap() process. If an error arises, we can very simply abort the operation with very little unwinding of state required. The existing logic contains another, related, peccadillo - since the .mmap() callback might do anything, it may also cause a previously unmergeable VMA to become mergeable with adjacent VMAs. Right now the logic will retry a merge like this only if the driver changes VMA flags, and changes them in such a way that a merge might succeed (that is, the flags are not 'special', that is do not contain any of the flags specified in VM_SPECIAL). This has also been the source of a great deal of pain - it's hard to reason about an .mmap() callback that might do - anything - but it's also hard to reason about setting up a VMA and writing to the maple tree, only to do it again utilising a great deal of shared state. Since .mmap_prepare() sets fields before the first merge is even attempted, the use of this callback obviates the need for this retry merge logic. A driver may only specify .mmap_prepare() or the deprecated .mmap() callback. In future we may add futher callbacks beyond .mmap_prepare() to faciliate all use cass as we convert drivers. In researching this change, I examined every .mmap() callback, and discovered only a very few that set VMA state in such a way that a. the VMA flags changed and b. this would be mergeable. In the majority of cases, it turns out that drivers are mapping kernel memory and thus ultimately set VM_PFNMAP, VM_MIXEDMAP, or other unmergeable VM_SPECIAL flags. Of those that remain I identified a number of cases which are only applicable in DAX, setting the VM_HUGEPAGE flag: * dax_mmap() * erofs_file_mmap() * ext4_file_mmap() * xfs_file_mmap() For this remerge to not occur and to impact users, each of these cases would require a user to mmap() files using DAX, in parts, immediately adjacent to one another. This is a very unlikely usecase and so it does not appear to be worthwhile to adjust this functionality accordingly. We can, however, very quickly do so if needed by simply adding an .mmap_prepare() callback to these as required. There are two further non-DAX cases I idenitfied: * orangefs_file_mmap() - Clears VM_RAND_READ if set, replacing with VM_SEQ_READ. * usb_stream_hwdep_mmap() - Sets VM_DONTDUMP. Both of these cases again seem very unlikely to be mmap()'d immediately adjacent to one another in a fashion that would result in a merge. Finally, we are left with a viable case: * secretmem_mmap() - Set VM_LOCKED, VM_DONTDUMP. This is viable enough that the mm selftests trigger the logic as a matter of course. Therefore, this series replace the .secretmem_mmap() hook with .secret_mmap_prepare(). This patch (of 3): Provide a means by which drivers can specify which fields of those permitted to be changed should be altered to prior to mmap()'ing a range (which may either result from a merge or from mapping an entirely new VMA). Doing so is substantially safer than the existing .mmap() calback which provides unrestricted access to the part-constructed VMA and permits drivers and file systems to do 'creative' things which makes it hard to reason about the state of the VMA after the function returns. The existing .mmap() callback's freedom has caused a great deal of issues, especially in error handling, as unwinding the mmap() state has proven to be non-trivial and caused significant issues in the past, for instance those addressed in commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour"). It also necessitates a second attempt at merge once the .mmap() callback has completed, which has caused issues in the past, is awkward, adds overhead and is difficult to reason about. The .mmap_prepare() callback eliminates this requirement, as we can update fields prior to even attempting the first merge. It is safer, as we heavily restrict what can actually be modified, and being invoked very early in the mmap() process, error handling can be performed safely with very little unwinding of state required. The .mmap_prepare() and deprecated .mmap() callbacks are mutually exclusive, so we permit only one to be invoked at a time. Update vma userland test stubs to account for changes. Link: https://lkml.kernel.org/r/cover.1746792520.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/adb36a7c4affd7393b2fc4b54cc5cfe211e41f71.1746792520.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Al Viro Cc: Christian Brauner Cc: David Hildenbrand Cc: Jan Kara Cc: Jann Horn Cc: Liam Howlett Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/fs.h | 25 +++++++++++++++++++++++++ include/linux/mm_types.h | 24 ++++++++++++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 016b0fe1536e..e2721a1ff13d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2169,6 +2169,7 @@ struct file_operations { int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, unsigned int poll_flags); + int (*mmap_prepare)(struct vm_area_desc *); } __randomize_layout; /* Supports async buffered reads */ @@ -2238,11 +2239,35 @@ struct inode_operations { struct offset_ctx *(*get_offset_ctx)(struct inode *inode); } ____cacheline_aligned; +/* Did the driver provide valid mmap hook configuration? */ +static inline bool file_has_valid_mmap_hooks(struct file *file) +{ + bool has_mmap = file->f_op->mmap; + bool has_mmap_prepare = file->f_op->mmap_prepare; + + /* Hooks are mutually exclusive. */ + if (WARN_ON_ONCE(has_mmap && has_mmap_prepare)) + return false; + if (WARN_ON_ONCE(!has_mmap && !has_mmap_prepare)) + return false; + + return true; +} + static inline int call_mmap(struct file *file, struct vm_area_struct *vma) { + if (WARN_ON_ONCE(file->f_op->mmap_prepare)) + return -EINVAL; + return file->f_op->mmap(file, vma); } +static inline int __call_mmap_prepare(struct file *file, + struct vm_area_desc *desc) +{ + return file->f_op->mmap_prepare(desc); +} + extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e76bade9ebb1..15808cad2bc1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -763,6 +763,30 @@ struct vma_numab_state { int prev_scan_seq; }; +/* + * Describes a VMA that is about to be mmap()'ed. Drivers may choose to + * manipulate mutable fields which will cause those fields to be updated in the + * resultant VMA. + * + * Helper functions are not required for manipulating any field. + */ +struct vm_area_desc { + /* Immutable state. */ + struct mm_struct *mm; + unsigned long start; + unsigned long end; + + /* Mutable fields. Populated with initial state. */ + pgoff_t pgoff; + struct file *file; + vm_flags_t vm_flags; + pgprot_t page_prot; + + /* Write-only fields. */ + const struct vm_operations_struct *vm_ops; + void *private_data; +}; + /* * This struct describes a virtual memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory -- cgit v1.2.3 From 0cad6736f4b9bbe8129f367ed5818fa4aef6c2b5 Mon Sep 17 00:00:00 2001 From: Feng Lee <379943137@qq.com> Date: Fri, 9 May 2025 14:32:30 +0800 Subject: mm: remove obsolete pgd_offset_gate() Remove pgd_offset_gate() completely and simply make the single caller use pgd_offset(). It appears that the gate area resides in the kernel-mapped segment exclusively on IA64. Therefore, removing pgd_offset_k is safe since IA64 is now obsolete. Link: https://lkml.kernel.org/r/tencent_503130C3CD56569191396268CF4D12F09A06@qq.com Signed-off-by: Feng Lee <379943137@qq.com> Reviewed-by: Barry Song Acked-by: David Hildenbrand Cc: Anshuman Khandual Cc: bibo mao Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: John Hubbard Cc: Lance Yang Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index b50447ef1c92..f1e890b60460 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1164,10 +1164,6 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) } #endif -#ifndef __HAVE_ARCH_PGD_OFFSET_GATE -#define pgd_offset_gate(mm, addr) pgd_offset(mm, addr) -#endif - #ifndef __HAVE_ARCH_MOVE_PTE #define move_pte(pte, old_addr, new_addr) (pte) #endif -- cgit v1.2.3 From dc75c3ced10c611f524e9e444303a0249ce32e43 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 12 May 2025 22:20:59 +0200 Subject: net: phy: remove stub for mdiobus_register_board_info The functionality of mdiobus_register_board_info() typically isn't optional for the caller. Therefore remove the stub. Note: Currently we have only one caller of mdiobus_register_board_info(), in a DSA/PHYLINK context. Therefore CONFIG_MDIO_DEVICE is selected anyway. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/410a2222-c4e8-45b0-9091-d49674caeb00@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index d62d292024bc..7c29d346d4b3 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2071,17 +2071,8 @@ struct mdio_board_info { const void *platform_data; }; -#if IS_ENABLED(CONFIG_MDIO_DEVICE) int mdiobus_register_board_info(const struct mdio_board_info *info, unsigned int n); -#else -static inline int mdiobus_register_board_info(const struct mdio_board_info *i, - unsigned int n) -{ - return 0; -} -#endif - /** * phy_module_driver() - Helper macro for registering PHY drivers -- cgit v1.2.3 From bc049387b41f41bee61e8cc338a5e99ca9798a09 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 13 May 2025 07:28:12 -0700 Subject: bpf: Add support for __prog argument suffix to pass in prog->aux Instead of hardcoding the list of kfuncs that need prog->aux passed to them with a combination of fixup_kfunc_call adjustment + __ign suffix, combine both in __prog suffix, which ignores the argument passed in, and fixes it up to the prog->aux. This allows kfuncs to have the prog->aux passed into them without having to touch the verifier. Cc: Tejun Heo Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250513142812.1021591-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 9734544b6957..cedd66867ecf 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -591,6 +591,7 @@ struct bpf_insn_aux_data { * bpf_fastcall pattern. */ u8 fastcall_spills_num:3; + u8 arg_prog:4; /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ -- cgit v1.2.3 From 3937f6db6e932c560a0f9ee2cd2a4fdcc314dadf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 12 May 2025 19:21:15 -0700 Subject: lib/crc16: unexport crc16_table and crc16_byte() Now that neither crc16_table nor crc16_byte() is used outside lib/crc16.c, fold them into lib/crc16.c. Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250513022115.39109-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc16.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc16.h b/include/linux/crc16.h index 9fa74529b317..b861d969b161 100644 --- a/include/linux/crc16.h +++ b/include/linux/crc16.h @@ -15,14 +15,7 @@ #include -extern u16 const crc16_table[256]; - -extern u16 crc16(u16 crc, const u8 *buffer, size_t len); - -static inline u16 crc16_byte(u16 crc, const u8 data) -{ - return (crc >> 8) ^ crc16_table[(crc ^ data) & 0xff]; -} +u16 crc16(u16 crc, const u8 *p, size_t len); #endif /* __CRC16_H */ -- cgit v1.2.3 From 439d47608bb3e5f7b5f5eb55c568576b60731c4d Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 12 May 2025 10:44:00 +0200 Subject: ata: libata: Print if port is external on boot Commit affccb16c117 ("ata: ahci: print the lpm policy on boot") added a lpm-pol print during boot, which shows the LPM policy used by each port. While the LPM policy is usually determined by the Kconfig CONFIG_SATA_MOBILE_LPM_POLICY, the Kconfig value is overridden e.g. if firmware has marked the port as hotplug capable / external. Commit f97106b10d9a ("ata: ahci: Add debug print for external port") did add a debug print to show if LPM was disabled because firmware has marked the port as external, however, because devices having broken LPM (even though they claim to support it) is more common than one would have hoped, print "ext" during boot if firmware has marked the port is external. This will make it easier to debug certain LPM issues, e.g. if firmware has enabled/marked only some of the ports as hotplug capable / external. Before (port marked as external by firmware): ata1: SATA max UDMA/133 abar m4096@0xfebd3000 port 0xfebd3100 irq 57 lpm-pol 0 After (port marked as external by firmware): ata1: SATA max UDMA/133 abar m4096@0xfebd3000 port 0xfebd3100 irq 57 lpm-pol 0 ext Signed-off-by: Niklas Cassel Signed-off-by: Damien Le Moal --- include/linux/libata.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 98affa1d9136..31be45fd47a6 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1618,6 +1618,8 @@ static inline void ata_port_desc_misc(struct ata_port *ap, int irq) { ata_port_desc(ap, "irq %d", irq); ata_port_desc(ap, "lpm-pol %d", ap->target_lpm_policy); + if (ap->pflags & ATA_PFLAG_EXTERNAL) + ata_port_desc(ap, "ext"); } static inline bool ata_tag_internal(unsigned int tag) -- cgit v1.2.3 From 6a09ae8281986d5fb5ce0115135c05a5afb8718e Mon Sep 17 00:00:00 2001 From: Richard Leitner Date: Wed, 7 May 2025 09:51:31 +0200 Subject: leds: flash: Add support for flash/strobe duration Add support for the new V4L2_CID_FLASH_DURATION control to the LEDs driver. Signed-off-by: Richard Leitner Link: https://lore.kernel.org/r/20250507-ov9282-flash-strobe-v4-2-72b299c1b7c9@linux.dev Signed-off-by: Lee Jones --- include/linux/led-class-flash.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/led-class-flash.h b/include/linux/led-class-flash.h index 36df927ec4b7..21ec856c36bc 100644 --- a/include/linux/led-class-flash.h +++ b/include/linux/led-class-flash.h @@ -45,6 +45,8 @@ struct led_flash_ops { int (*timeout_set)(struct led_classdev_flash *fled_cdev, u32 timeout); /* get the flash LED fault */ int (*fault_get)(struct led_classdev_flash *fled_cdev, u32 *fault); + /* set flash duration */ + int (*duration_set)(struct led_classdev_flash *fled_cdev, u32 duration); }; /* @@ -75,6 +77,9 @@ struct led_classdev_flash { /* flash timeout value in microseconds along with its constraints */ struct led_flash_setting timeout; + /* flash timeout value in microseconds along with its constraints */ + struct led_flash_setting duration; + /* LED Flash class sysfs groups */ const struct attribute_group *sysfs_groups[LED_FLASH_SYSFS_GROUPS_SIZE]; }; @@ -209,4 +214,15 @@ int led_set_flash_timeout(struct led_classdev_flash *fled_cdev, u32 timeout); */ int led_get_flash_fault(struct led_classdev_flash *fled_cdev, u32 *fault); +/** + * led_set_flash_duration - set flash LED duration + * @fled_cdev: the flash LED to set + * @timeout: the flash duration to set it to + * + * Set the flash strobe duration. + * + * Returns: 0 on success or negative error value on failure + */ +int led_set_flash_duration(struct led_classdev_flash *fled_cdev, u32 duration); + #endif /* __LINUX_FLASH_LEDS_H_INCLUDED */ -- cgit v1.2.3 From 28026cf2dd84d961a62123b1fa941dc3c2c4a132 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 May 2025 17:31:40 +0100 Subject: genirq/msi: Add .msi_teardown() callback as the reverse of .msi_prepare() While the MSI ops do have a .msi_prepare() callback that is responsible for setting up the relevant (usually per-device) allocation, there is no callback reversing this setup. For this purpose, add .msi_teardown() callback. In order to avoid breaking the ITS driver that suffers from related issues, do not call the callback just yet. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250513163144.2215824-2-maz@kernel.org --- include/linux/msi.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 8c0ec9fc05a3..63c23003ec9b 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -423,6 +423,7 @@ struct msi_domain_info; * @msi_init: Domain specific init function for MSI interrupts * @msi_free: Domain specific function to free a MSI interrupts * @msi_prepare: Prepare the allocation of the interrupts in the domain + * @msi_teardown: Reverse the effects of @msi_prepare * @prepare_desc: Optional function to prepare the allocated MSI descriptor * in the domain * @set_desc: Set the msi descriptor for an interrupt @@ -438,8 +439,9 @@ struct msi_domain_info; * @get_hwirq, @msi_init and @msi_free are callbacks used by the underlying * irqdomain. * - * @msi_check, @msi_prepare, @prepare_desc and @set_desc are callbacks used by the - * msi_domain_alloc/free_irqs*() variants. + * @msi_check, @msi_prepare, @msi_teardown, @prepare_desc and + * @set_desc are callbacks used by the msi_domain_alloc/free_irqs*() + * variants. * * @domain_alloc_irqs, @domain_free_irqs can be used to override the * default allocation/free functions (__msi_domain_alloc/free_irqs). This @@ -461,6 +463,8 @@ struct msi_domain_ops { int (*msi_prepare)(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg); + void (*msi_teardown)(struct irq_domain *domain, + msi_alloc_info_t *arg); void (*prepare_desc)(struct irq_domain *domain, msi_alloc_info_t *arg, struct msi_desc *desc); void (*set_desc)(msi_alloc_info_t *arg, @@ -489,6 +493,7 @@ struct msi_domain_ops { * @handler: Optional: associated interrupt flow handler * @handler_data: Optional: associated interrupt flow handler data * @handler_name: Optional: associated interrupt flow handler name + * @alloc_data: Optional: associated interrupt allocation data * @data: Optional: domain specific data */ struct msi_domain_info { @@ -501,6 +506,7 @@ struct msi_domain_info { irq_flow_handler_t handler; void *handler_data; const char *handler_name; + msi_alloc_info_t *alloc_data; void *data; }; -- cgit v1.2.3 From 1396e89e09f00deb816e5f4a176d71d554922292 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 May 2025 17:31:42 +0100 Subject: genirq/msi: Move prepare() call to per-device allocation The current device MSI infrastructure is subtly broken, as it will issue an .msi_prepare() callback into the MSI controller driver every time it needs to allocate an MSI. That's pretty wrong, as the contract (or unwarranted assumption, depending who you ask) between the MSI controller and the core code is that .msi_prepare() is called exactly once per device. This leads to some subtle breakage in some MSI controller drivers, as it gives the impression that there are multiple endpoints sharing a bus identifier (RID in PCI parlance, DID for GICv3+). It implies that whatever allocation the ITS driver (for example) has done on behalf of these devices cannot be undone, as there is no way to track the shared state. This is particularly bad for wire-MSI devices, for which .msi_prepare() is called for each input line. To address this issue, move the call to .msi_prepare() to take place at the point of irq domain allocation, which is the only place that makes sense. The msi_alloc_info_t structure is made part of the msi_domain_template, so that its life-cycle is that of the domain as well. Finally, the msi_info::alloc_data field is made to point at this allocation tracking structure, ensuring that it is carried around the block. This is all pretty straightforward, except for the non-device-MSI leftovers, which still have to call .msi_prepare() at the old spot. One day... Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250513163144.2215824-4-maz@kernel.org --- include/linux/msi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 63c23003ec9b..f4b94ccaf0c1 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -516,12 +516,14 @@ struct msi_domain_info { * @chip: Interrupt chip for this domain * @ops: MSI domain ops * @info: MSI domain info data + * @alloc_info: MSI domain allocation data (architecture specific) */ struct msi_domain_template { char name[48]; struct irq_chip chip; struct msi_domain_ops ops; struct msi_domain_info info; + msi_alloc_info_t alloc_info; }; /* -- cgit v1.2.3 From cc52a697f87e8b2d88298827aca3f81398385572 Mon Sep 17 00:00:00 2001 From: Ivaylo Ivanov Date: Sun, 4 May 2025 17:45:27 +0300 Subject: phy: exynos5-usbdrd: support Exynos USBDRD 3.2 4nm controller Add support for the Exynos USB 3.2 DRD 4nm controller. It's used in recent 4nm SoCs like Exynos2200 and Exynos2400. This device consists of 3 underlying and independent phys: SEC link control phy, Synopsys eUSB 2.0 and Synopsys USBDP/SS combophy. Unlike older device designs, where the internal phy blocks were all IP of Samsung, Synopsys phys are present. This means that the link controller is now mapped differently to account for missing bits and registers. The Synopsys phys also have separate register bases. As there are non-SEC PHYs present now, it doesn't make much sense to implement them in this driver. They are expected to be configured by external drivers, so pass phandles to them. USBDRD3.2 link controller set up is still required beforehand. This commit adds the necessary changes for USB HS to work. USB SS and DisplayPort are out of scope in this commit and will be introduced in the future. Signed-off-by: Ivaylo Ivanov Link: https://lore.kernel.org/r/20250504144527.1723980-11-ivo.ivanov.ivanov1@gmail.com Signed-off-by: Vinod Koul --- include/linux/soc/samsung/exynos-regs-pmu.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h index cde299a85384..e921e7fe9be7 100644 --- a/include/linux/soc/samsung/exynos-regs-pmu.h +++ b/include/linux/soc/samsung/exynos-regs-pmu.h @@ -187,6 +187,9 @@ /* Only for S5Pv210 */ #define S5PV210_EINT_WAKEUP_MASK 0xC004 +/* Only for Exynos2200 */ +#define EXYNOS2200_PHY_CTRL_USB20 0x72C + /* Only for Exynos4210 */ #define S5P_CMU_CLKSTOP_LCD1_LOWPWR 0x1154 #define S5P_CMU_RESET_LCD1_LOWPWR 0x1174 -- cgit v1.2.3 From 5fa96c83b81e50833274f3b450ee9a8c0b2172bc Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 1 Apr 2025 19:07:03 +0100 Subject: coresight: Introduce pause and resume APIs for source Introduce APIs for pausing and resuming trace source and export as GPL symbols. Signed-off-by: Leo Yan Reviewed-by: Mike Leach Reviewed-by: James Clark Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250401180708.385396-3-leo.yan@arm.com --- include/linux/coresight.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 8abdd8b5c791..4ac65c68bbf4 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -398,6 +398,8 @@ struct coresight_ops_link { * is associated to. * @enable: enables tracing for a source. * @disable: disables tracing for a source. + * @resume_perf: resumes tracing for a source in perf session. + * @pause_perf: pauses tracing for a source in perf session. */ struct coresight_ops_source { int (*cpu_id)(struct coresight_device *csdev); @@ -405,6 +407,8 @@ struct coresight_ops_source { enum cs_mode mode, struct coresight_path *path); void (*disable)(struct coresight_device *csdev, struct perf_event *event); + int (*resume_perf)(struct coresight_device *csdev); + void (*pause_perf)(struct coresight_device *csdev); }; /** -- cgit v1.2.3 From 676e8cf70cb0533e1118e29898c9a9c33ae3a10f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 May 2025 13:36:59 +0200 Subject: sched,livepatch: Untangle cond_resched() and live-patching With the goal of deprecating / removing VOLUNTARY preempt, live-patch needs to stop relying on cond_resched() to make forward progress. Instead, rely on schedule() with TASK_FREEZABLE set. Just like live-patching, the freezer needs to be able to stop tasks in a safe / known state. [bigeasy: use likely() in __klp_sched_try_switch() and update comments] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Petr Mladek Tested-by: Petr Mladek Tested-by: Miroslav Benes Acked-by: Miroslav Benes Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20250509113659.wkP_HJ5z@linutronix.de --- include/linux/livepatch_sched.h | 14 +++++--------- include/linux/sched.h | 6 ------ 2 files changed, 5 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/livepatch_sched.h b/include/linux/livepatch_sched.h index 013794fb5da0..065c185f2763 100644 --- a/include/linux/livepatch_sched.h +++ b/include/linux/livepatch_sched.h @@ -3,27 +3,23 @@ #define _LINUX_LIVEPATCH_SCHED_H_ #include -#include +#include #ifdef CONFIG_LIVEPATCH void __klp_sched_try_switch(void); -#if !defined(CONFIG_PREEMPT_DYNAMIC) || !defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) - DECLARE_STATIC_KEY_FALSE(klp_sched_try_switch_key); -static __always_inline void klp_sched_try_switch(void) +static __always_inline void klp_sched_try_switch(struct task_struct *curr) { - if (static_branch_unlikely(&klp_sched_try_switch_key)) + if (static_branch_unlikely(&klp_sched_try_switch_key) && + READ_ONCE(curr->__state) & TASK_FREEZABLE) __klp_sched_try_switch(); } -#endif /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ - #else /* !CONFIG_LIVEPATCH */ -static inline void klp_sched_try_switch(void) {} -static inline void __klp_sched_try_switch(void) {} +static inline void klp_sched_try_switch(struct task_struct *curr) {} #endif /* CONFIG_LIVEPATCH */ #endif /* _LINUX_LIVEPATCH_SCHED_H_ */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f96ac1982893..b98195991031 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -44,7 +44,6 @@ #include #include #include -#include #include #include #include @@ -2089,9 +2088,6 @@ extern int __cond_resched(void); #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -void sched_dynamic_klp_enable(void); -void sched_dynamic_klp_disable(void); - DECLARE_STATIC_CALL(cond_resched, __cond_resched); static __always_inline int _cond_resched(void) @@ -2112,7 +2108,6 @@ static __always_inline int _cond_resched(void) static inline int _cond_resched(void) { - klp_sched_try_switch(); return __cond_resched(); } @@ -2122,7 +2117,6 @@ static inline int _cond_resched(void) static inline int _cond_resched(void) { - klp_sched_try_switch(); return 0; } -- cgit v1.2.3 From aab12022b076f0b385b7a9a78e1161bd2df5d1e3 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 29 Apr 2025 11:18:08 +0100 Subject: soundwire: bus: Add internal slave ID and use for IRQs Currently the SoundWire IRQ code uses the dev_num to create an IRQ mapping for each slave. However, there is an issue there, the dev_num is only allocated when the slave enumerates on the bus and enumeration may happen before or after probe of the slave driver. In the case enumeration happens after probe of the slave driver then the IRQ mapping will use dev_num before it is set. This could cause multiple slaves to use zero as their IRQ mapping. It is very desirable to have the IRQ mapped before the slave probe is called, so drivers can do resource allocation in probe as normal. To solve these issues add an internal ID created for each slave when it is probed and use that for mapping the IRQ. Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20250429101808.348462-3-ckeepax@opensource.cirrus.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 2362f621d94c..0832776262ac 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,7 @@ struct sdw_slave; #define SDW_FRAME_CTRL_BITS 48 #define SDW_MAX_DEVICES 11 +#define SDW_FW_MAX_DEVICES 16 #define SDW_MAX_PORTS 15 #define SDW_VALID_PORT_RANGE(n) ((n) < SDW_MAX_PORTS && (n) >= 1) @@ -630,6 +632,7 @@ struct sdw_slave_ops { * struct sdw_slave - SoundWire Slave * @id: MIPI device ID * @dev: Linux device + * @index: internal ID for this slave * @irq: IRQ number * @status: Status reported by the Slave * @bus: Bus handle @@ -661,6 +664,7 @@ struct sdw_slave_ops { struct sdw_slave { struct sdw_slave_id id; struct device dev; + int index; int irq; enum sdw_slave_status status; struct sdw_bus *bus; @@ -968,6 +972,7 @@ struct sdw_stream_runtime { * @md: Master device * @bus_lock_key: bus lock key associated to @bus_lock * @bus_lock: bus lock + * @slave_ida: IDA for allocating internal slave IDs * @slaves: list of Slaves on this bus * @msg_lock_key: message lock key associated to @msg_lock * @msg_lock: message lock @@ -1010,6 +1015,7 @@ struct sdw_bus { struct sdw_master_device *md; struct lock_class_key bus_lock_key; struct mutex bus_lock; + struct ida slave_ida; struct list_head slaves; struct lock_class_key msg_lock_key; struct mutex msg_lock; -- cgit v1.2.3 From e1f3f5be9e8e730290ebe6d490383af4d77f0f38 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 30 Apr 2025 15:47:13 +0800 Subject: soundwire: intel: Add awareness of ACE3+ microphone privacy ACE3 introduced microphone privacy and along this feature it adds a new register in vendor specific SHIM to control and status reporting. The control of mic privacy via the SHIM register is only to enable the interrupt generation via soundwire, but not handled by the soundwire code as the mic privacy is not a feature of the soundwire IP. On the other hand, printing the register value brings value for debugging, so add a new flag to allow this conditionally. Signed-off-by: Peter Ujfalusi Reviewed-by: Ranjani Sridharan Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20250430074714.94000-2-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw_intel.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 493d9de4e472..2af1d8174c50 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -189,6 +189,9 @@ #define SDW_SHIM3_INTEL_VS_ACTMCTL_DOAISE2 BIT(14) #define SDW_SHIM3_INTEL_VS_ACTMCTL_CLDE BIT(15) +/* ACE3+ Mic privacy control and status register */ +#define SDW_SHIM2_INTEL_VS_PVCCS 0x10 + /** * struct sdw_intel_stream_params_data: configuration passed during * the @params_stream callback, e.g. for interaction with DSP @@ -331,6 +334,7 @@ struct sdw_intel_ctx { * @shim_base: sdw shim base. * @alh_base: sdw alh base. * @ext: extended HDaudio link support + * @mic_privacy: ACE version supports microphone privacy * @hbus: hdac_bus pointer, needed for power management * @eml_lock: mutex protecting shared registers in the HDaudio multi-link * space @@ -349,6 +353,7 @@ struct sdw_intel_res { u32 shim_base; u32 alh_base; bool ext; + bool mic_privacy; struct hdac_bus *hbus; struct mutex *eml_lock; }; -- cgit v1.2.3 From 9002b75aa8e6f034ffbd1c1ccac46927a1cf0f12 Mon Sep 17 00:00:00 2001 From: Fabrizio Castro Date: Wed, 23 Apr 2025 15:34:19 +0100 Subject: irqchip/renesas-rzv2h: Add rzv2h_icu_register_dma_req() On the Renesas RZ/V2H(P) family of SoCs, DMAC IPs are connected to the Interrupt Control Unit (ICU). For DMA transfers, a request number must be registered with the ICU, which means that the DMAC driver has to be able to instruct the ICU driver with the registration of such id. Export rzv2h_icu_register_dma_req() so that the DMAC driver can register the DMAC request number. Signed-off-by: Fabrizio Castro Reviewed-by: Thomas Gleixner Reviewed-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20250423143422.3747702-4-fabrizio.castro.jz@renesas.com Signed-off-by: Vinod Koul --- include/linux/irqchip/irq-renesas-rzv2h.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 include/linux/irqchip/irq-renesas-rzv2h.h (limited to 'include/linux') diff --git a/include/linux/irqchip/irq-renesas-rzv2h.h b/include/linux/irqchip/irq-renesas-rzv2h.h new file mode 100644 index 000000000000..618a60d2eac0 --- /dev/null +++ b/include/linux/irqchip/irq-renesas-rzv2h.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Renesas RZ/V2H(P) Interrupt Control Unit (ICU) + * + * Copyright (C) 2025 Renesas Electronics Corporation. + */ + +#ifndef __LINUX_IRQ_RENESAS_RZV2H +#define __LINUX_IRQ_RENESAS_RZV2H + +#include + +#define RZV2H_ICU_DMAC_REQ_NO_DEFAULT 0x3ff + +#ifdef CONFIG_RENESAS_RZV2H_ICU +void rzv2h_icu_register_dma_req(struct platform_device *icu_dev, u8 dmac_index, u8 dmac_channel, + u16 req_no); +#else +static inline void rzv2h_icu_register_dma_req(struct platform_device *icu_dev, u8 dmac_index, + u8 dmac_channel, u16 req_no) { } +#endif + +#endif /* __LINUX_IRQ_RENESAS_RZV2H */ -- cgit v1.2.3 From 9510b38dc0ba358c93cbf5ee7c28820afb85937b Mon Sep 17 00:00:00 2001 From: Erick Shepherd Date: Mon, 31 Mar 2025 17:13:37 -0500 Subject: mmc: Add quirk to disable DDR50 tuning Adds the MMC_QUIRK_NO_UHS_DDR50_TUNING quirk and updates mmc_execute_tuning() to return 0 if that quirk is set. This fixes an issue on certain Swissbit SD cards that do not support DDR50 tuning where tuning requests caused I/O errors to be thrown. Signed-off-by: Erick Shepherd Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/20250331221337.1414534-1-erick.shepherd@ni.com Signed-off-by: Ulf Hansson --- include/linux/mmc/card.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 526fce581657..ddcdf23d731c 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -329,6 +329,7 @@ struct mmc_card { #define MMC_QUIRK_BROKEN_SD_CACHE (1<<15) /* Disable broken SD cache support */ #define MMC_QUIRK_BROKEN_CACHE_FLUSH (1<<16) /* Don't flush cache until the write has occurred */ #define MMC_QUIRK_BROKEN_SD_POWEROFF_NOTIFY (1<<17) /* Disable broken SD poweroff notify support */ +#define MMC_QUIRK_NO_UHS_DDR50_TUNING (1<<18) /* Disable DDR50 tuning */ bool written_flag; /* Indicates eMMC has been written since power on */ bool reenable_cmdq; /* Re-enable Command Queue */ -- cgit v1.2.3 From da012e1eb5372be518e2667c1afa00996bb076e6 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 1 Apr 2025 11:58:46 +0200 Subject: mmc: rename mmc_can_gpio_cd() to mmc_host_can_gpio_cd() mmc_can_* functions sometimes relate to the card and sometimes to the host. Make it obvious by renaming this function to include 'host'. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20250401095847.29271-11-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- include/linux/mmc/slot-gpio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/slot-gpio.h b/include/linux/mmc/slot-gpio.h index 1ed7b0d1e4f9..4f6a46c636ec 100644 --- a/include/linux/mmc/slot-gpio.h +++ b/include/linux/mmc/slot-gpio.h @@ -24,7 +24,7 @@ int mmc_gpiod_request_ro(struct mmc_host *host, const char *con_id, int mmc_gpiod_set_cd_config(struct mmc_host *host, unsigned long config); int mmc_gpio_set_cd_wake(struct mmc_host *host, bool on); void mmc_gpiod_request_cd_irq(struct mmc_host *host); -bool mmc_can_gpio_cd(struct mmc_host *host); +bool mmc_host_can_gpio_cd(struct mmc_host *host); bool mmc_can_gpio_ro(struct mmc_host *host); #endif -- cgit v1.2.3 From 2e1a26ed6b3864576720364c793e4abf0681fcf5 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 1 Apr 2025 11:58:47 +0200 Subject: mmc: rename mmc_can_gpio_ro() to mmc_host_can_gpio_ro() mmc_can_* functions sometimes relate to the card and sometimes to the host. Make it obvious by renaming this function to include 'host'. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20250401095847.29271-12-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- include/linux/mmc/slot-gpio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/slot-gpio.h b/include/linux/mmc/slot-gpio.h index 4f6a46c636ec..23ac5696fa38 100644 --- a/include/linux/mmc/slot-gpio.h +++ b/include/linux/mmc/slot-gpio.h @@ -25,6 +25,6 @@ int mmc_gpiod_set_cd_config(struct mmc_host *host, unsigned long config); int mmc_gpio_set_cd_wake(struct mmc_host *host, bool on); void mmc_gpiod_request_cd_irq(struct mmc_host *host); bool mmc_host_can_gpio_cd(struct mmc_host *host); -bool mmc_can_gpio_ro(struct mmc_host *host); +bool mmc_host_can_gpio_ro(struct mmc_host *host); #endif -- cgit v1.2.3 From ac01fa73f5309a35eff83be61442a8891159b487 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 10 May 2025 16:37:30 -0400 Subject: tracepoint: Have tracepoints created with DECLARE_TRACE() have _tp suffix Most tracepoints in the kernel are created with TRACE_EVENT(). The TRACE_EVENT() macro (and DECLARE_EVENT_CLASS() and DEFINE_EVENT() where in reality, TRACE_EVENT() is just a helper macro that calls those other two macros), will create not only a tracepoint (the function trace_() used in the kernel), it also exposes the tracepoint to user space along with defining what fields will be saved by that tracepoint. There are a few places that tracepoints are created in the kernel that are not exposed to userspace via tracefs. They can only be accessed from code within the kernel. These tracepoints are created with DEFINE_TRACE() Most of these tracepoints end with "_tp". This is useful as when the developer sees that, they know that the tracepoint is for in-kernel only (meaning it can only be accessed inside the kernel, either directly by the kernel or indirectly via modules and BPF programs) and is not exposed to user space. Instead of making this only a process to add "_tp", enforce it by making the DECLARE_TRACE() append the "_tp" suffix to the tracepoint. This requires adding DECLARE_TRACE_EVENT() macros for the TRACE_EVENT() macro to use that keeps the original name. Link: https://lore.kernel.org/all/20250418083351.20a60e64@gandalf.local.home/ Cc: netdev Cc: Jiri Olsa Cc: Peter Zijlstra Cc: David Ahern Cc: Juri Lelli Cc: Breno Leitao Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Gabriele Monaco Cc: Masami Hiramatsu Link: https://lore.kernel.org/20250510163730.092fad5b@gandalf.local.home Acked-by: Mathieu Desnoyers Acked-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index a351763e6965..826ce3f8e1f8 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -464,16 +464,30 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #endif #define DECLARE_TRACE(name, proto, args) \ - __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + __DECLARE_TRACE(name##_tp, PARAMS(proto), PARAMS(args), \ cpu_online(raw_smp_processor_id()), \ PARAMS(void *__data, proto)) #define DECLARE_TRACE_CONDITION(name, proto, args, cond) \ - __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + __DECLARE_TRACE(name##_tp, PARAMS(proto), PARAMS(args), \ cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \ PARAMS(void *__data, proto)) #define DECLARE_TRACE_SYSCALL(name, proto, args) \ + __DECLARE_TRACE_SYSCALL(name##_tp, PARAMS(proto), PARAMS(args), \ + PARAMS(void *__data, proto)) + +#define DECLARE_TRACE_EVENT(name, proto, args) \ + __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + cpu_online(raw_smp_processor_id()), \ + PARAMS(void *__data, proto)) + +#define DECLARE_TRACE_EVENT_CONDITION(name, proto, args, cond) \ + __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \ + PARAMS(void *__data, proto)) + +#define DECLARE_TRACE_EVENT_SYSCALL(name, proto, args) \ __DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args), \ PARAMS(void *__data, proto)) @@ -591,32 +605,32 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, print) #define DEFINE_EVENT(template, name, proto, args) \ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + DECLARE_TRACE_EVENT(name, PARAMS(proto), PARAMS(args)) #define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg)\ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + DECLARE_TRACE_EVENT(name, PARAMS(proto), PARAMS(args)) #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + DECLARE_TRACE_EVENT(name, PARAMS(proto), PARAMS(args)) #define DEFINE_EVENT_CONDITION(template, name, proto, \ args, cond) \ - DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ + DECLARE_TRACE_EVENT_CONDITION(name, PARAMS(proto), \ PARAMS(args), PARAMS(cond)) #define TRACE_EVENT(name, proto, args, struct, assign, print) \ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + DECLARE_TRACE_EVENT(name, PARAMS(proto), PARAMS(args)) #define TRACE_EVENT_FN(name, proto, args, struct, \ assign, print, reg, unreg) \ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#define TRACE_EVENT_FN_COND(name, proto, args, cond, struct, \ + DECLARE_TRACE_EVENT(name, PARAMS(proto), PARAMS(args)) +#define TRACE_EVENT_FN_COND(name, proto, args, cond, struct, \ assign, print, reg, unreg) \ - DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ + DECLARE_TRACE_EVENT_CONDITION(name, PARAMS(proto), \ PARAMS(args), PARAMS(cond)) #define TRACE_EVENT_CONDITION(name, proto, args, cond, \ struct, assign, print) \ - DECLARE_TRACE_CONDITION(name, PARAMS(proto), \ + DECLARE_TRACE_EVENT_CONDITION(name, PARAMS(proto), \ PARAMS(args), PARAMS(cond)) #define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign, \ print, reg, unreg) \ - DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args)) + DECLARE_TRACE_EVENT_SYSCALL(name, PARAMS(proto), PARAMS(args)) #define TRACE_EVENT_FLAGS(event, flag) -- cgit v1.2.3 From 289c99bec7eed918ab37c62cbb29a2e3f58fb1fb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 13 May 2025 22:24:09 -0700 Subject: lib/crc32: add SPDX license identifier lib/crc32.c and include/linux/crc32.h got missed by the bulk SPDX conversion because of the nonstandard explanation of the license. However, crc32.c clearly states that it's licensed under the GNU General Public License, Version 2. And the comment in crc32.h clearly indicates that it's meant to have the same license as crc32.c. Therefore, apply SPDX-License-Identifier: GPL-2.0-only to both files. Reviewed-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250514052409.194822-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc32.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 69c2e8bb3782..569dc13f139f 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -1,7 +1,4 @@ -/* - * crc32.h - * See linux/lib/crc32.c for license and changes - */ +/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _LINUX_CRC32_H #define _LINUX_CRC32_H -- cgit v1.2.3 From 539fbab37881e32ba6a708a100de6db19e1e7e7d Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 7 Apr 2025 15:28:05 +0300 Subject: tpm: Mask TPM RC in tpm2_start_auth_session() tpm2_start_auth_session() does not mask TPM RC correctly from the callers: [ 28.766528] tpm tpm0: A TPM error (2307) occurred start auth session Process TPM RCs inside tpm2_start_auth_session(), and map them to POSIX error codes. Cc: stable@vger.kernel.org # v6.10+ Fixes: 699e3efd6c64 ("tpm: Add HMAC session start and end functions") Reported-by: Herbert Xu Closes: https://lore.kernel.org/linux-integrity/Z_NgdRHuTKP6JK--@gondor.apana.org.au/ Reviewed-by: Stefano Garzarella Signed-off-by: Jarkko Sakkinen --- include/linux/tpm.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 6c3125300c00..9ac9768cc8f7 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -257,6 +257,7 @@ enum tpm2_return_codes { TPM2_RC_TESTING = 0x090A, /* RC_WARN */ TPM2_RC_REFERENCE_H0 = 0x0910, TPM2_RC_RETRY = 0x0922, + TPM2_RC_SESSION_MEMORY = 0x0903, }; enum tpm2_command_codes { @@ -437,6 +438,24 @@ static inline u32 tpm2_rc_value(u32 rc) return (rc & BIT(7)) ? rc & 0xbf : rc; } +/* + * Convert a return value from tpm_transmit_cmd() to POSIX error code. + */ +static inline ssize_t tpm_ret_to_err(ssize_t ret) +{ + if (ret < 0) + return ret; + + switch (tpm2_rc_value(ret)) { + case TPM2_RC_SUCCESS: + return 0; + case TPM2_RC_SESSION_MEMORY: + return -ENOMEM; + default: + return -EFAULT; + } +} + #if defined(CONFIG_TCG_TPM) || defined(CONFIG_TCG_TPM_MODULE) extern int tpm_is_tpm2(struct tpm_chip *chip); -- cgit v1.2.3 From 2f661f71fda1fc0c42b7746ca5b7da529eb6b5be Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Fri, 4 Apr 2025 10:23:14 +0200 Subject: tpm: tis: Double the timeout B to 4s With some Infineon chips the timeouts in tpm_tis_send_data (both B and C) can reach up to about 2250 ms. Timeout C is retried since commit de9e33df7762 ("tpm, tpm_tis: Workaround failed command reception on Infineon devices") Timeout B still needs to be extended. The problem is most commonly encountered with context related operation such as load context/save context. These are issued directly by the kernel, and there is no retry logic for them. When a filesystem is set up to use the TPM for unlocking the boot fails, and restarting the userspace service is ineffective. This is likely because ignoring a load context/save context result puts the real TPM state and the TPM state expected by the kernel out of sync. Chips known to be affected: tpm_tis IFX1522:00: 2.0 TPM (device-id 0x1D, rev-id 54) Description: SLB9672 Firmware Revision: 15.22 tpm_tis MSFT0101:00: 2.0 TPM (device-id 0x1B, rev-id 22) Firmware Revision: 7.83 tpm_tis MSFT0101:00: 2.0 TPM (device-id 0x1A, rev-id 16) Firmware Revision: 5.63 Link: https://lore.kernel.org/linux-integrity/Z5pI07m0Muapyu9w@kitsune.suse.cz/ Signed-off-by: Michal Suchanek Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- include/linux/tpm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 9ac9768cc8f7..a3d8305e88a5 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -224,7 +224,7 @@ enum tpm2_const { enum tpm2_timeouts { TPM2_TIMEOUT_A = 750, - TPM2_TIMEOUT_B = 2000, + TPM2_TIMEOUT_B = 4000, TPM2_TIMEOUT_C = 200, TPM2_TIMEOUT_D = 30, TPM2_DURATION_SHORT = 20, -- cgit v1.2.3 From 45a442fe369e6c4e0b4aa9f63b31c3f2f9e2090e Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 12 May 2025 17:06:04 -0700 Subject: Drivers: hv: vmbus: Remove vmbus_sendpacket_pagebuffer() With the netvsc driver changed to use vmbus_sendpacket_mpb_desc() instead of vmbus_sendpacket_pagebuffer(), the latter has no remaining callers. Remove it. Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-6-mhklinux@outlook.com Signed-off-by: Jakub Kicinski --- include/linux/hyperv.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 675959fb97ba..ea806cef4a0d 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1161,13 +1161,6 @@ extern int vmbus_sendpacket(struct vmbus_channel *channel, enum vmbus_packet_type type, u32 flags); -extern int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, - void *buffer, - u32 bufferlen, - u64 requestid); - extern int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, struct vmbus_packet_mpb_array *mpb, u32 desc_size, -- cgit v1.2.3 From 8d9117009dd690f647a66912f429c96335069907 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 13 May 2025 13:23:31 +0200 Subject: fuse: don't allow signals to interrupt getdents copying When getting the directory contents, the entries are first fetched to a kernel buffer, then they are copied to userspace with dir_emit(). This second phase is non-blocking as long as the userspace buffer is not paged out, making it interruptible makes zero sense. Overload d_type as flags, since it only uses 4 bits from 32. Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/20250513112335.1473177-1-mszeredi@redhat.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index cf7208500cee..c8b5af17060e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2073,6 +2073,9 @@ struct dir_context { loff_t pos; }; +/* If OR-ed with d_type, pending signals are not checked */ +#define FILLDIR_FLAG_NOINTR 0x1000 + /* * These flags let !MMU mmap() govern direct device mapping vs immediate * copying more easily for MAP_PRIVATE, especially for ROM filesystems. -- cgit v1.2.3 From 132833405e61463d47d6badff1b8080b09b5808e Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Mon, 5 May 2025 19:54:39 +0530 Subject: PCI: Add debugfs support for exposing PTM context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Precision Time Management (PTM) mechanism defined in PCIe spec r6.0, sec 6.21 allows precise coordination of timing information across multiple components in a PCIe hierarchy with independent local time clocks. PCI core already supports enabling PTM in the root port and endpoint devices through PTM Extended Capability registers. But the PTM context supported by the PTM capable components such as Root Complex (RC) and Endpoint (EP) controllers were not exposed as of now. Part of the reason is that the spec doesn't define how the context information is exposed to the software and left it to the vendor implementation. So there is no standardized way to get access to the context information and each vendor have defined their own way. This commit adds debugfs support to expose the PTM context to userspace from both PCIe RC and EP controllers. Since the context information is exposed in a vendor specific way, the debugfs interface allows the controller drivers to implement callbacks for each attribute, to be called by the generic PTM driver. The Controller drivers are expected to call pcie_ptm_create_debugfs() to create the debugfs attributes for the PTM context and call pcie_ptm_destroy_debugfs() to destroy them. The drivers should also populate the relevant callbacks in the 'struct pcie_ptm_ops' structure based on the controller implementation. Below PTM context are exposed through debugfs: PCIe RC ======= 1. PTM Local clock 2. PTM T2 timestamp 3. PTM T3 timestamp 4. PTM Context valid PCIe EP ======= 1. PTM Local clock 2. PTM T1 timestamp 3. PTM T4 timestamp 4. PTM Master clock 5. PTM Context update Signed-off-by: Manivannan Sadhasivam [kwilczynski: fix overflow issue reported by Dan Carpenter from https://lore.kernel.org/linux-pci/b41c1754-c6b7-4805-9f14-7c643d6c5304@suswa.mountain] Signed-off-by: Krzysztof Wilczyński Link: https://patch.msgid.link/20250505-pcie-ptm-v4-1-02d26d51400b@linaro.org --- include/linux/pci.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 0e8e3fd77e96..01a29076604f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1856,6 +1856,39 @@ static inline bool pci_aer_available(void) { return false; } bool pci_ats_disabled(void); +#define PCIE_PTM_CONTEXT_UPDATE_AUTO 0 +#define PCIE_PTM_CONTEXT_UPDATE_MANUAL 1 + +struct pcie_ptm_ops { + int (*check_capability)(void *drvdata); + int (*context_update_write)(void *drvdata, u8 mode); + int (*context_update_read)(void *drvdata, u8 *mode); + int (*context_valid_write)(void *drvdata, bool valid); + int (*context_valid_read)(void *drvdata, bool *valid); + int (*local_clock_read)(void *drvdata, u64 *clock); + int (*master_clock_read)(void *drvdata, u64 *clock); + int (*t1_read)(void *drvdata, u64 *clock); + int (*t2_read)(void *drvdata, u64 *clock); + int (*t3_read)(void *drvdata, u64 *clock); + int (*t4_read)(void *drvdata, u64 *clock); + + bool (*context_update_visible)(void *drvdata); + bool (*context_valid_visible)(void *drvdata); + bool (*local_clock_visible)(void *drvdata); + bool (*master_clock_visible)(void *drvdata); + bool (*t1_visible)(void *drvdata); + bool (*t2_visible)(void *drvdata); + bool (*t3_visible)(void *drvdata); + bool (*t4_visible)(void *drvdata); +}; + +struct pci_ptm_debugfs { + struct dentry *debugfs; + const struct pcie_ptm_ops *ops; + struct mutex lock; + void *pdata; +}; + #ifdef CONFIG_PCIE_PTM int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); void pci_disable_ptm(struct pci_dev *dev); @@ -1868,6 +1901,18 @@ static inline bool pcie_ptm_enabled(struct pci_dev *dev) { return false; } #endif +#if IS_ENABLED(CONFIG_DEBUG_FS) && IS_ENABLED(CONFIG_PCIE_PTM) +struct pci_ptm_debugfs *pcie_ptm_create_debugfs(struct device *dev, void *pdata, + const struct pcie_ptm_ops *ops); +void pcie_ptm_destroy_debugfs(struct pci_ptm_debugfs *ptm_debugfs); +#else +static inline struct pci_ptm_debugfs +*pcie_ptm_create_debugfs(struct device *dev, void *pdata, + const struct pcie_ptm_ops *ops) { return NULL; } +static inline void +pcie_ptm_destroy_debugfs(struct pci_ptm_debugfs *ptm_debugfs) { } +#endif + void pci_cfg_access_lock(struct pci_dev *dev); bool pci_cfg_access_trylock(struct pci_dev *dev); void pci_cfg_access_unlock(struct pci_dev *dev); -- cgit v1.2.3 From e0410e956b97e8b50b2aa7b02ba70e5f09b31ebe Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 13 May 2025 17:10:08 +0200 Subject: readdir: supply dir_context.count as readdir buffer size hint This is a preparation for large readdir buffers in fuse. Simply setting the fuse buffer size to the userspace buffer size should work, the record sizes are similar (fuse's is slightly larger than libc's, so no overflow should ever happen). Signed-off-by: Miklos Szeredi Signed-off-by: Jaco Kroon Link: https://lore.kernel.org/20250513151012.1476536-1-mszeredi@redhat.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c8b5af17060e..ddd54a664916 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2071,6 +2071,13 @@ typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64, struct dir_context { filldir_t actor; loff_t pos; + /* + * Filesystems MUST NOT MODIFY count, but may use as a hint: + * 0 unknown + * > 0 space in buffer (assume at least one entry) + * INT_MAX unlimited + */ + int count; }; /* If OR-ed with d_type, pending signals are not checked */ -- cgit v1.2.3 From d8c5507cd140d9471472ece673e70250b957c595 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Tue, 13 May 2025 17:03:24 +0200 Subject: include/linux/fs.h: add inode_lock_killable() Prepare for making inode operations killable while they're waiting for the lock. Signed-off-by: Max Kellermann Link: https://lore.kernel.org/20250513150327.1373061-1-max.kellermann@ionos.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ddd54a664916..82c6093b8582 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -867,6 +867,11 @@ static inline void inode_lock(struct inode *inode) down_write(&inode->i_rwsem); } +static inline __must_check int inode_lock_killable(struct inode *inode) +{ + return down_write_killable(&inode->i_rwsem); +} + static inline void inode_unlock(struct inode *inode) { up_write(&inode->i_rwsem); @@ -877,6 +882,11 @@ static inline void inode_lock_shared(struct inode *inode) down_read(&inode->i_rwsem); } +static inline __must_check int inode_lock_shared_killable(struct inode *inode) +{ + return down_read_killable(&inode->i_rwsem); +} + static inline void inode_unlock_shared(struct inode *inode) { up_read(&inode->i_rwsem); -- cgit v1.2.3 From c16608005ccb99fbde3a4cd96eab28e16f148abf Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 13 May 2025 11:19:22 +0300 Subject: net: Look for bonding slaves in the bond's network namespace Update the for_each_netdev_in_bond_rcu macro to iterate through network devices in the bond's network namespace instead of always using init_net. This change is safe because: 1. **Bond-Slave Namespace Relationship**: A bond device and its slaves must reside in the same network namespace. The bond device's namespace is established at creation time and cannot change. 2. **Slave Movement Implications**: Any attempt to move a slave device to a different namespace automatically removes it from the bond, as per kernel networking stack rules. This maintains the invariant that slaves must exist in the same namespace as their bond. This change is part of an effort to enable Link Aggregation (LAG) to work properly inside custom network namespaces. Previously, the macro would only find slave devices in the initial network namespace, preventing proper bonding functionality in custom namespaces. Signed-off-by: Shay Drory Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250513081922.525716-1-mbloch@nvidia.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 32a1e41636a9..9e3a2d8452d6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3268,7 +3268,7 @@ int call_netdevice_notifiers_info(unsigned long val, #define for_each_netdev_continue_rcu(net, d) \ list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_in_bond_rcu(bond, slave) \ - for_each_netdev_rcu(&init_net, slave) \ + for_each_netdev_rcu(dev_net_rcu(bond), slave) \ if (netdev_master_upper_dev_get_rcu(slave) == (bond)) #define net_device_entry(lh) list_entry(lh, struct net_device, dev_list) -- cgit v1.2.3 From b9eef3391de028fdd88fd7a2f81a4834fc98c9ac Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:26 +0200 Subject: xdp: Use nested-BH locking for system_page_pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit system_page_pool is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Make a struct with a page_pool member (original system_page_pool) and a local_lock_t and use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. Cc: Andrew Lunn Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jesper Dangaard Brouer Cc: John Fastabend Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250512092736.229935-6-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9e3a2d8452d6..73a97cf1bbce 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3503,7 +3503,12 @@ struct softnet_data { }; DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); -DECLARE_PER_CPU(struct page_pool *, system_page_pool); + +struct page_pool_bh { + struct page_pool *pool; + local_lock_t bh_lock; +}; +DECLARE_PER_CPU(struct page_pool_bh, system_page_pool); #ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) -- cgit v1.2.3 From 7fe70c06a182a140be9996b02256d907e114479a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:31 +0200 Subject: net/sched: act_mirred: Move the recursion counter struct netdev_xmit mirred_nest_level is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Move mirred_nest_level to struct netdev_xmit as u8, provide wrappers. Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Juri Lelli Link: https://patch.msgid.link/20250512092736.229935-11-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- include/linux/netdevice_xmit.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h index 38325e070296..848735b3a7c0 100644 --- a/include/linux/netdevice_xmit.h +++ b/include/linux/netdevice_xmit.h @@ -8,6 +8,9 @@ struct netdev_xmit { #ifdef CONFIG_NET_EGRESS u8 skip_txqueue; #endif +#if IS_ENABLED(CONFIG_NET_ACT_MIRRED) + u8 sched_mirred_nest; +#endif }; #endif -- cgit v1.2.3 From ce45dc4bb22e96b59a07e19b67e915d99dd5281b Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Fri, 25 Apr 2025 11:24:22 +0200 Subject: PCI: Limit visibility of match_driver flag to PCI core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 58d9a38f6fac ("PCI: Skip attaching driver in device_add()"), PCI enumeration is split into two steps: In the first step, all devices are published in sysfs with device_add(). In the second step, drivers are bound to the devices with device_attach(). To delay driver binding until the second step, a "bool match_driver" in struct pci_dev is used. Instead of a bool, use a bit in the "unsigned long priv_flags" to shrink struct pci_dev a little and prevent use of the bool outside the PCI core (as has happened with commit cbbc00be2ce3 ("iommu/amd: Prevent binding other PCI drivers to IOMMU PCI devices")). Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Signed-off-by: Krzysztof Wilczyński Link: https://patch.msgid.link/d22a9e5b81d6bd8dd1837607d6156679b3b1199c.1745572340.git.lukas@wunner.de --- include/linux/pci.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index d26e6611bd00..053d9dd494d4 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -423,8 +423,6 @@ struct pci_dev { struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */ struct resource driver_exclusive_resource; /* driver exclusive resource ranges */ - bool match_driver; /* Skip attaching driver */ - unsigned int transparent:1; /* Subtractive decode bridge */ unsigned int io_window:1; /* Bridge has I/O window */ unsigned int pref_window:1; /* Bridge has pref mem window */ -- cgit v1.2.3 From 241b9b584d50e01f7eb50c0555aa570502bcc8c5 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 14 May 2025 13:50:33 -0700 Subject: dm-zone: Use bdev_*() helper functions where applicable Improve code readability by using bdev_is_zone_aligned() and bdev_offset_from_zone_start() where applicable. No functionality has been changed. This patch is a reworked version of a patch from Pankaj Raghav. See also https://lore.kernel.org/linux-block/20220923173618.6899-11-p.raghav@samsung.com/. Cc: Damien Le Moal Cc: Pankaj Raghav Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Signed-off-by: Mikulas Patocka --- include/linux/blkdev.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e39c45bc0a97..edc941ec8fdd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1419,6 +1419,13 @@ static inline bool bdev_is_zone_start(struct block_device *bdev, return bdev_offset_from_zone_start(bdev, sector) == 0; } +/* Check whether @sector is a multiple of the zone size. */ +static inline bool bdev_is_zone_aligned(struct block_device *bdev, + sector_t sector) +{ + return bdev_is_zone_start(bdev, sector); +} + /** * bdev_zone_is_seq - check if a sector belongs to a sequential write zone * @bdev: block device to check -- cgit v1.2.3 From 18049c8cff9cc89daadc4df6975f7d9069638926 Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Thu, 8 May 2025 16:26:42 -0700 Subject: perf/aux: Allocate non-contiguous AUX pages by default perf always allocates contiguous AUX pages based on aux_watermark. However, this contiguous allocation doesn't benefit all PMUs. For instance, ARM SPE and TRBE operate with virtual pages, and Coresight ETR allocates a separate buffer. For these PMUs, allocating contiguous AUX pages unnecessarily exacerbates memory fragmentation. This fragmentation can prevent their use on long-running devices. This patch modifies the perf driver to be memory-friendly by default, by allocating non-contiguous AUX pages. For PMUs requiring contiguous pages (Intel BTS and some Intel PT), the existing PERF_PMU_CAP_AUX_NO_SG capability can be used. For PMUs that don't require but can benefit from contiguous pages (some Intel PT), a new capability, PERF_PMU_CAP_AUX_PREFER_LARGE, is added to maintain their existing behavior. Signed-off-by: Yabin Cui Signed-off-by: Ingo Molnar Reviewed-by: James Clark Reviewed-by: Anshuman Khandual Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Alexander Shishkin Cc: Mark Rutland Cc: Namhyung Kim Link: https://lore.kernel.org/r/20250508232642.148767-1-yabinc@google.com --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 947ad12dfdbe..a96c00e2ceca 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -303,6 +303,7 @@ struct perf_event_pmu_context; #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 #define PERF_PMU_CAP_AUX_PAUSE 0x0200 +#define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 /** * pmu::scope -- cgit v1.2.3 From 189572bf4e005431051319def435ac4b7e4489ca Mon Sep 17 00:00:00 2001 From: "Yury Norov [NVIDIA]" Date: Thu, 15 May 2025 16:58:31 +0000 Subject: cpumask: Relax cpumask_any_but() Similarly to other cpumask search functions, accept -1, and consider it as 'any CPU' hint. This helps users to avoid coding special cases. Signed-off-by: Yury Norov [NVIDIA] Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: James Morse Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Tested-by: James Morse Tested-by: Tony Luck Tested-by: Fenghua Yu Link: https://lore.kernel.org/20250515165855.31452-2-james.morse@arm.com --- include/linux/cpumask.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index f9a868384083..a3ee875df508 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -413,14 +413,18 @@ unsigned int cpumask_next_wrap(int n, const struct cpumask *src) * @cpu: the cpu to ignore. * * Often used to find any cpu but smp_processor_id() in a mask. + * If @cpu == -1, the function is equivalent to cpumask_any(). * Return: >= nr_cpu_ids if no cpus set. */ static __always_inline -unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) +unsigned int cpumask_any_but(const struct cpumask *mask, int cpu) { unsigned int i; - cpumask_check(cpu); + /* -1 is a legal arg here. */ + if (cpu != -1) + cpumask_check(cpu); + for_each_cpu(i, mask) if (i != cpu) break; @@ -433,16 +437,20 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) * @mask2: the second input cpumask * @cpu: the cpu to ignore * + * If @cpu == -1, the function is equivalent to cpumask_any_and(). * Returns >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_any_and_but(const struct cpumask *mask1, const struct cpumask *mask2, - unsigned int cpu) + int cpu) { unsigned int i; - cpumask_check(cpu); + /* -1 is a legal arg here. */ + if (cpu != -1) + cpumask_check(cpu); + i = cpumask_first_and(mask1, mask2); if (i != cpu) return i; -- cgit v1.2.3 From 13f0a02bf4c1c5888c736cedef9ca50de666adb3 Mon Sep 17 00:00:00 2001 From: "Yury Norov [NVIDIA]" Date: Thu, 15 May 2025 16:58:32 +0000 Subject: find: Add find_first_andnot_bit() The function helps to implement cpumask_andnot() APIs. Signed-off-by: Yury Norov [NVIDIA] Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: James Morse Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Tested-by: James Morse Tested-by: Tony Luck Tested-by: Fenghua Yu Link: https://lore.kernel.org/20250515165855.31452-3-james.morse@arm.com --- include/linux/find.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index 68685714bc18..5a2c267ea7f9 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -29,6 +29,8 @@ unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsign unsigned long n); extern unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); +unsigned long _find_first_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long size); unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); @@ -347,6 +349,29 @@ unsigned long find_first_and_bit(const unsigned long *addr1, } #endif +/** + * find_first_andnot_bit - find the first bit set in 1st memory region and unset in 2nd + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @size: The bitmap size in bits + * + * Returns the bit number for the first set bit + * If no bits are set, returns >= @size. + */ +static __always_inline +unsigned long find_first_andnot_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0); + + return val ? __ffs(val) : size; + } + + return _find_first_andnot_bit(addr1, addr2, size); +} + /** * find_first_and_and_bit - find the first set bit in 3 memory regions * @addr1: The first address to base the search on -- cgit v1.2.3 From 5da703ef4e4a82b2f9a00f2b3a1eae5657e68a92 Mon Sep 17 00:00:00 2001 From: "Yury Norov [NVIDIA]" Date: Thu, 15 May 2025 16:58:33 +0000 Subject: cpumask: Add cpumask_{first,next}_andnot() API With the lack of the functions, client code has to abuse less efficient cpumask_nth(). Signed-off-by: Yury Norov [NVIDIA] Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: James Morse Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: James Morse Tested-by: Tony Luck Link: https://lore.kernel.org/20250515165855.31452-4-james.morse@arm.com --- include/linux/cpumask.h | 59 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index a3ee875df508..6a569c7534db 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -178,6 +178,19 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } +/** + * cpumask_first_andnot - return the first cpu from *srcp1 & ~*srcp2 + * @srcp1: the first input + * @srcp2: the second input + * + * Return: >= nr_cpu_ids if no such cpu found. + */ +static __always_inline +unsigned int cpumask_first_andnot(const struct cpumask *srcp1, const struct cpumask *srcp2) +{ + return find_first_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); +} + /** * cpumask_first_and_and - return the first cpu from *srcp1 & *srcp2 & *srcp3 * @srcp1: the first input @@ -284,6 +297,25 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, small_cpumask_bits, n + 1); } +/** + * cpumask_next_andnot - get the next cpu in *src1p & ~*src2p + * @n: the cpu prior to the place to search (i.e. return will be > @n) + * @src1p: the first cpumask pointer + * @src2p: the second cpumask pointer + * + * Return: >= nr_cpu_ids if no further cpus set in both. + */ +static __always_inline +unsigned int cpumask_next_andnot(int n, const struct cpumask *src1p, + const struct cpumask *src2p) +{ + /* -1 is a legal arg here. */ + if (n != -1) + cpumask_check(n); + return find_next_andnot_bit(cpumask_bits(src1p), cpumask_bits(src2p), + small_cpumask_bits, n + 1); +} + /** * cpumask_next_and_wrap - get the next cpu in *src1p & *src2p, starting from * @n+1. If nothing found, wrap around and start from @@ -458,6 +490,33 @@ unsigned int cpumask_any_and_but(const struct cpumask *mask1, return cpumask_next_and(cpu, mask1, mask2); } +/** + * cpumask_any_andnot_but - pick an arbitrary cpu from *mask1 & ~*mask2, but not this one. + * @mask1: the first input cpumask + * @mask2: the second input cpumask + * @cpu: the cpu to ignore + * + * If @cpu == -1, the function returns the first matching cpu. + * Returns >= nr_cpu_ids if no cpus set. + */ +static __always_inline +unsigned int cpumask_any_andnot_but(const struct cpumask *mask1, + const struct cpumask *mask2, + int cpu) +{ + unsigned int i; + + /* -1 is a legal arg here. */ + if (cpu != -1) + cpumask_check(cpu); + + i = cpumask_first_andnot(mask1, mask2); + if (i != cpu) + return i; + + return cpumask_next_andnot(cpu, mask1, mask2); +} + /** * cpumask_nth - get the Nth cpu in a cpumask * @srcp: the cpumask pointer -- cgit v1.2.3 From ea33537d82921e71f852ea2ed985acc562125efe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 May 2025 19:39:12 +0000 Subject: tcp: add receive queue awareness in tcp_rcv_space_adjust() If the application can not drain fast enough a TCP socket queue, tcp_rcv_space_adjust() can overestimate tp->rcvq_space.space. Then sk->sk_rcvbuf can grow and hit tcp_rmem[2] for no good reason. Fix this by taking into acount the number of available bytes. Keeping sk->sk_rcvbuf at the right size allows better cache efficiency. Signed-off-by: Eric Dumazet Reviewed-by: Wei Wang Link: https://patch.msgid.link/20250513193919.1089692-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a8af71623ba7..29f59d50dc73 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -340,7 +340,7 @@ struct tcp_sock { } rcv_rtt_est; /* Receiver queue space */ struct { - u32 space; + int space; u32 seq; u64 time; } rcvq_space; -- cgit v1.2.3 From bc740420d7ae7878696d5ad9f3dbcb11e2599b2e Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 15 May 2025 16:58:39 +0000 Subject: x86/resctrl: Drop __init/__exit on assorted symbols Because ARM's MPAM controls are probed using MMIO, resctrl can't be initialised until enough CPUs are online to have determined the system-wide supported num_closid. Arm64 also supports 'late onlined secondaries', where only a subset of CPUs are online during boot. These two combine to mean the MPAM driver may not be able to initialise resctrl until user-space has brought 'enough' CPUs online. To allow MPAM to initialise resctrl after __init text has been free'd, remove all the __init markings from resctrl. The existing __exit markings cause these functions to be removed by the linker as it has never been possible to build resctrl as a module. MPAM has an error interrupt which causes the driver to reset and disable itself. Remove the __exit markings to allow the MPAM driver to tear down resctrl when an error occurs. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: Carl Worth # arm64 Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Tested-by: Tony Luck Link: https://lore.kernel.org/20250515165855.31452-10-james.morse@arm.com --- include/linux/resctrl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 880351ca3dfc..b8f8240050b4 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -358,7 +358,7 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); -__init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); +bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); /** * resctrl_arch_mon_event_config_write() - Write the config for an event. @@ -514,7 +514,7 @@ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r); extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; -int __init resctrl_init(void); -void __exit resctrl_exit(void); +int resctrl_init(void); +void resctrl_exit(void); #endif /* _RESCTRL_H */ -- cgit v1.2.3 From eff042ddf4b9587fa7394d502b93e06ec6015177 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Apr 2025 15:36:50 -0400 Subject: sunrpc: Add a helper to derive maxpages from sv_max_mesg This page count is to be used to allocate various arrays of pages and bio_vecs, replacing the fixed RPCSVC_MAXPAGES value. The documenting comment is somewhat stale -- of course NFSv4 COMPOUND procedures may have multiple payloads. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 74658cca0f38..749f8e53e8a6 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -159,6 +159,23 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); #define RPCSVC_MAXPAGES ((RPCSVC_MAXPAYLOAD+PAGE_SIZE-1)/PAGE_SIZE \ + 2 + 1) +/** + * svc_serv_maxpages - maximum count of pages needed for one RPC message + * @serv: RPC service context + * + * Returns a count of pages or vectors that can hold the maximum + * size RPC message for @serv. + * + * Each request/reply pair can have at most one "payload", plus two + * pages, one for the request, and one for the reply. + * nfsd_splice_actor() might need an extra page when a READ payload + * is not page-aligned. + */ +static inline unsigned long svc_serv_maxpages(const struct svc_serv *serv) +{ + return DIV_ROUND_UP(serv->sv_max_mesg, PAGE_SIZE) + 2 + 1; +} + /* * The context of a single thread, including the request currently being * processed. -- cgit v1.2.3 From ed603bcf4feac05df937bf78bc7feb75f988a971 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Apr 2025 15:36:52 -0400 Subject: sunrpc: Replace the rq_pages array with dynamically-allocated memory As a step towards making NFSD's maximum rsize and wsize variable at run-time, replace the fixed-size rq_vec[] array in struct svc_rqst with a chunk of dynamically-allocated memory. On a system with 8-byte pointers and 4KB pages, pahole reports that the rq_pages[] array is 2080 bytes. This patch replaces that with a single 8-byte pointer field. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 749f8e53e8a6..57be69539888 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -205,7 +205,8 @@ struct svc_rqst { struct xdr_stream rq_res_stream; struct page *rq_scratch_page; struct xdr_buf rq_res; - struct page *rq_pages[RPCSVC_MAXPAGES + 1]; + unsigned long rq_maxpages; /* num of entries in rq_pages */ + struct page * *rq_pages; struct page * *rq_respages; /* points into rq_pages */ struct page * *rq_next_page; /* next reply page to use */ struct page * *rq_page_end; /* one past the last page */ -- cgit v1.2.3 From 59cf7346542babdaae99e72365174eab4fa276ac Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Apr 2025 15:36:54 -0400 Subject: sunrpc: Replace the rq_bvec array with dynamically-allocated memory As a step towards making NFSD's maximum rsize and wsize variable at run-time, replace the fixed-size rq_bvec[] array in struct svc_rqst with a chunk of dynamically-allocated memory. The rq_bvec[] array contains enough bio_vecs to handle each page in a maximum size RPC message. On a system with 8-byte pointers and 4KB pages, pahole reports that the rq_bvec[] array is 4144 bytes. This patch replaces that array with a single 8-byte pointer field. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 57be69539888..538bea716c6b 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -213,7 +213,7 @@ struct svc_rqst { struct folio_batch rq_fbatch; struct kvec rq_vec[RPCSVC_MAXPAGES]; /* generally useful.. */ - struct bio_vec rq_bvec[RPCSVC_MAXPAGES]; + struct bio_vec *rq_bvec; __be32 rq_xid; /* transmission id */ u32 rq_prog; /* program number */ -- cgit v1.2.3 From f2e597353d50f05a600dddbf84a362839cf1c224 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 9 May 2025 13:39:23 -0400 Subject: NFSD: De-duplicate the svc_fill_write_vector() call sites All three call sites do the same thing. I'm struggling with this a bit, however. struct xdr_buf is an XDR layer object and unmarshaling a WRITE payload is clearly a task intended to be done by the proc and xdr functions, not by VFS. This feels vaguely like a layering violation. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 538bea716c6b..dec636345ee2 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -471,7 +471,7 @@ int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, - struct xdr_buf *payload); + const struct xdr_buf *payload); char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, void *p, size_t total); -- cgit v1.2.3 From b406c6b78198f123e08061c45d56803459785975 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 8 May 2025 11:56:40 -0400 Subject: SUNRPC: Remove svc_fill_write_vector() Clean up: This API is no longer used. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index dec636345ee2..510ee1927977 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -470,8 +470,6 @@ const char * svc_proc_name(const struct svc_rqst *rqstp); int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); -unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, - const struct xdr_buf *payload); char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, void *p, size_t total); -- cgit v1.2.3 From 1259560b988c959abf9b84da0b9e56c2863b6837 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 6 May 2025 20:15:08 -0400 Subject: SUNRPC: Remove svc_rqst :: rq_vec Clean up: This array is no longer used. On a system with 8-byte pointers and 4KB pages, pahole reports that the rq_vec[] array accounts for 4144 bytes. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 510ee1927977..6540af4b9bdb 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -212,7 +212,6 @@ struct svc_rqst { struct page * *rq_page_end; /* one past the last page */ struct folio_batch rq_fbatch; - struct kvec rq_vec[RPCSVC_MAXPAGES]; /* generally useful.. */ struct bio_vec *rq_bvec; __be32 rq_xid; /* transmission id */ -- cgit v1.2.3 From f4126823c1fd677d9811a0252830c6c73ddb7e99 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Apr 2025 15:36:55 -0400 Subject: sunrpc: Adjust size of socket's receive page array dynamically As a step towards making NFSD's maximum rsize and wsize variable at run-time, make sk_pages a flexible array. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svcsock.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index bf45d9e8492a..963bbe251e52 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -40,7 +40,9 @@ struct svc_sock { struct completion sk_handshake_done; - struct page * sk_pages[RPCSVC_MAXPAGES]; /* received data */ + /* received data */ + unsigned long sk_maxpages; + struct page * sk_pages[] __counted_by(sk_maxpages); }; static inline u32 svc_sock_reclen(struct svc_sock *svsk) -- cgit v1.2.3 From 81381d1a90dea186ce4585d9bd3cdc40b50e9c48 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Apr 2025 15:36:56 -0400 Subject: svcrdma: Adjust the number of entries in svc_rdma_recv_ctxt::rc_pages Allow allocation of more entries in the rc_pages[] array when the maximum size of an RPC message is increased. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 619fc0bd837a..1016f2feddc4 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -202,7 +202,8 @@ struct svc_rdma_recv_ctxt { struct svc_rdma_pcl rc_reply_pcl; unsigned int rc_page_count; - struct page *rc_pages[RPCSVC_MAXPAGES]; + unsigned long rc_maxpages; + struct page *rc_pages[] __counted_by(rc_maxpages); }; /* -- cgit v1.2.3 From 56ab43f50d708e155f013ef60cc1b4fe39bed42e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Apr 2025 15:36:57 -0400 Subject: svcrdma: Adjust the number of entries in svc_rdma_send_ctxt::sc_pages Allow allocation of more entries in the sc_pages[] array when the maximum size of an RPC message is increased. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 1016f2feddc4..22704c2e5b9b 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -245,7 +245,8 @@ struct svc_rdma_send_ctxt { void *sc_xprt_buf; int sc_page_count; int sc_cur_sge_no; - struct page *sc_pages[RPCSVC_MAXPAGES]; + unsigned long sc_maxpages; + struct page **sc_pages; struct ib_sge sc_sges[]; }; -- cgit v1.2.3 From 0af165bb903cdc005066494cc931076dafd76894 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Apr 2025 15:36:58 -0400 Subject: sunrpc: Remove the RPCSVC_MAXPAGES macro It is no longer used. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 6540af4b9bdb..d57df042e24a 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -150,14 +150,7 @@ extern u32 svc_max_payload(const struct svc_rqst *rqstp); * list. xdr_buf.tail points to the end of the first page. * This assumes that the non-page part of an rpc reply will fit * in a page - NFSd ensures this. lockd also has no trouble. - * - * Each request/reply pair can have at most one "payload", plus two pages, - * one for the request, and one for the reply. - * We using ->sendfile to return read data, we might need one extra page - * if the request is not page-aligned. So add another '1'. */ -#define RPCSVC_MAXPAGES ((RPCSVC_MAXPAYLOAD+PAGE_SIZE-1)/PAGE_SIZE \ - + 2 + 1) /** * svc_serv_maxpages - maximum count of pages needed for one RPC message -- cgit v1.2.3 From 1e7dbad6d1fe4b35ccd2aa8fea9496d837af4430 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Apr 2025 15:37:02 -0400 Subject: SUNRPC: Bump the maximum payload size for the server Increase the maximum server-side RPC payload to 4MB. The default remains at 1MB. An API to adjust the operational maximum was added in 2006 by commit 596bbe53eb3a ("[PATCH] knfsd: Allow max size of NFSd payload to be configured"). To adjust the operational maximum using this API, shut down the NFS server. Then echo a new value into: /proc/fs/nfsd/max_block_size And restart the NFS server. Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index d57df042e24a..48666b83fe68 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -119,14 +119,14 @@ void svc_destroy(struct svc_serv **svcp); * Linux limit; someone who cares more about NFS/UDP performance * can test a larger number. * - * For TCP transports we have more freedom. A size of 1MB is - * chosen to match the client limit. Other OSes are known to - * have larger limits, but those numbers are probably beyond - * the point of diminishing returns. + * For non-UDP transports we have more freedom. A size of 4MB is + * chosen to accommodate clients that support larger I/O sizes. */ -#define RPCSVC_MAXPAYLOAD (1*1024*1024u) -#define RPCSVC_MAXPAYLOAD_TCP RPCSVC_MAXPAYLOAD -#define RPCSVC_MAXPAYLOAD_UDP (32*1024u) +enum { + RPCSVC_MAXPAYLOAD = 4 * 1024 * 1024, + RPCSVC_MAXPAYLOAD_TCP = RPCSVC_MAXPAYLOAD, + RPCSVC_MAXPAYLOAD_UDP = 32 * 1024, +}; extern u32 svc_max_payload(const struct svc_rqst *rqstp); -- cgit v1.2.3 From d4fb6b8e4640adeff9673ae398525be7382d3197 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 15 May 2025 16:58:41 +0000 Subject: x86/resctrl: Add end-marker to the resctrl_event_id enum The resctrl_event_id enum gives names to the counter event numbers on x86. These are used directly by resctrl. To allow the MPAM driver to keep an array of these the size of the enum needs to be known. Add a 'num_events' enum entry which can be used to size an array. This is added to the enum to reduce conflicts with another series, which in turn requires get_arch_mbm_state() to have a default case. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Tony Luck Link: https://lore.kernel.org/20250515165855.31452-12-james.morse@arm.com --- include/linux/resctrl_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index f26450b3326b..69bf740130ac 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -49,6 +49,9 @@ enum resctrl_event_id { QOS_L3_OCCUP_EVENT_ID = 0x01, QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, + + /* Must be the last */ + QOS_NUM_EVENTS, }; #endif /* __LINUX_RESCTRL_TYPES_H */ -- cgit v1.2.3 From bff70402d6d67843fe319338e4c56e1cba13fbd8 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 15 May 2025 16:58:45 +0000 Subject: fs/resctrl: Add boiler plate for external resctrl code Add Makefile and Kconfig for fs/resctrl. Add ARCH_HAS_CPU_RESCTRL for the common parts of the resctrl interface and make X86_CPU_RESCTRL select this. Adding an include of asm/resctrl.h to linux/resctrl.h allows the /fs/resctrl files to switch over to using this header instead. Co-developed-by: Dave Martin Signed-off-by: Dave Martin Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Tested-by: Fenghua Yu Tested-by: Carl Worth # arm64 Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Tested-by: Tony Luck Link: https://lore.kernel.org/20250515165855.31452-16-james.morse@arm.com --- include/linux/resctrl.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index b8f8240050b4..5c7c8bf2c47f 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -8,6 +8,10 @@ #include #include +#ifdef CONFIG_ARCH_HAS_CPU_RESCTRL +#include +#endif + /* CLOSID, RMID value used by the default control group */ #define RESCTRL_RESERVED_CLOSID 0 #define RESCTRL_RESERVED_RMID 0 -- cgit v1.2.3 From 3d95a49b365e06d1b4c4e5a426fdc70449a334f3 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 15 May 2025 16:58:46 +0000 Subject: x86/resctrl: Move the filesystem bits to headers visible to fs/resctrl Once the filesystem parts of resctrl move to fs/resctrl, it cannot rely on definitions in x86's internal.h. Move definitions in internal.h that need to be shared between the filesystem and architecture code to header files that fs/resctrl can include. Doing this separately means the filesystem code only moves between files of the same name, instead of having these changes mixed in too. Co-developed-by: Dave Martin Signed-off-by: Dave Martin Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Shaopeng Tan Reviewed-by: Tony Luck Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Tested-by: Fenghua Yu Tested-by: Carl Worth # arm64 Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Amit Singh Tomar # arm64 Tested-by: Shanker Donthineni # arm64 Tested-by: Babu Moger Tested-by: Tony Luck Link: https://lore.kernel.org/20250515165855.31452-17-james.morse@arm.com --- include/linux/resctrl.h | 3 +++ include/linux/resctrl_types.h | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 5c7c8bf2c47f..b9d1f2916e9c 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -403,6 +403,9 @@ static inline u32 resctrl_get_config_index(u32 closid, } } +bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l); +int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); + /* * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index 69bf740130ac..a66e7936943e 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -7,6 +7,9 @@ #ifndef __LINUX_RESCTRL_TYPES_H #define __LINUX_RESCTRL_TYPES_H +#define MAX_MBA_BW 100u +#define MBM_OVERFLOW_INTERVAL 1000 + /* Reads to Local DRAM Memory */ #define READS_TO_LOCAL_MEM BIT(0) -- cgit v1.2.3 From 7bdb619c7f9f0a7264b2d69ad0472bf2c785e52a Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 15 May 2025 16:58:47 +0000 Subject: x86/resctrl: Move enum resctrl_event_id to resctrl.h resctrl_types.h contains common types and constants to enable architectures to use these types in their definitions within asm/resctrl.h. enum resctrl_event_id was placed in resctrl_types.h for resctrl_arch_get_cdp_enabled() and resctrl_arch_set_cdp_enabled(), but these two functions are no longer inlined by any architecture. Move enum resctrl_event_id to resctrl.h. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Tony Luck Link: https://lore.kernel.org/20250515165855.31452-18-james.morse@arm.com --- include/linux/resctrl.h | 10 ++++++++++ include/linux/resctrl_types.h | 10 ---------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index b9d1f2916e9c..5ef972cbf56b 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -48,6 +48,16 @@ int proc_resctrl_show(struct seq_file *m, for_each_rdt_resource((r)) \ if ((r)->mon_capable) +enum resctrl_res_level { + RDT_RESOURCE_L3, + RDT_RESOURCE_L2, + RDT_RESOURCE_MBA, + RDT_RESOURCE_SMBA, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + /** * enum resctrl_conf_type - The type of configuration. * @CDP_NONE: No prioritisation, both code and data are controlled or monitored. diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index a66e7936943e..a25fb9c4070d 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -34,16 +34,6 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) -enum resctrl_res_level { - RDT_RESOURCE_L3, - RDT_RESOURCE_L2, - RDT_RESOURCE_MBA, - RDT_RESOURCE_SMBA, - - /* Must be the last */ - RDT_NUM_RESOURCES, -}; - /* * Event IDs, the values match those used to program IA32_QM_EVTSEL before * reading IA32_QM_CTR on RDT systems. -- cgit v1.2.3 From 279f225951e3c77a1708d77f557b4cc7bdbd76ed Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 15 May 2025 16:58:49 +0000 Subject: x86/resctrl: Move pseudo lock prototypes to include/linux/resctrl.h The resctrl pseudo-lock feature allows an architecture to allocate data into particular cache portions, which are then treated as reserved to avoid that data ever being evicted. Setting this up is deeply architecture specific as it involves disabling prefetchers etc. It is not possible to support this kind of feature on arm64. Risc-V is assumed to be the same. The prototypes for the architecture code were added to x86's asm/resctrl.h, with other architectures able to provide stubs for their architecture. This forces other architectures to provide identical stubs. Move the prototypes and stubs to linux/resctrl.h, and switch between them using the existing Kconfig symbol. Signed-off-by: James Morse Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: Babu Moger Tested-by: Shaopeng Tan Tested-by: Tony Luck Link: https://lore.kernel.org/20250515165855.31452-20-james.morse@arm.com --- include/linux/resctrl.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 5ef972cbf56b..9ba771f2ddea 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -534,4 +534,17 @@ extern unsigned int resctrl_rmid_realloc_limit; int resctrl_init(void); void resctrl_exit(void); +#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK +u64 resctrl_arch_get_prefetch_disable_bits(void); +int resctrl_arch_pseudo_lock_fn(void *_plr); +int resctrl_arch_measure_cycles_lat_fn(void *_plr); +int resctrl_arch_measure_l2_residency(void *_plr); +int resctrl_arch_measure_l3_residency(void *_plr); +#else +static inline u64 resctrl_arch_get_prefetch_disable_bits(void) { return 0; } +static inline int resctrl_arch_pseudo_lock_fn(void *_plr) { return 0; } +static inline int resctrl_arch_measure_cycles_lat_fn(void *_plr) { return 0; } +static inline int resctrl_arch_measure_l2_residency(void *_plr) { return 0; } +static inline int resctrl_arch_measure_l3_residency(void *_plr) { return 0; } +#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ #endif /* _RESCTRL_H */ -- cgit v1.2.3 From e847a847aea5728dfcd13b558b9d82b79f9a85c7 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:29:04 +0100 Subject: irqdomain: Drop of_node_to_fwnode() All uses of of_node_to_fwnode() in non-irqdomain code were changed to "officially" defined of_fwnode_handle(). Therefore, the former can be dropped along with the last uses in the irqdomain code. Due to merge logistics the inline cannot be dropped immediately. Move it to a deprecated section, which will be removed during the merge window. [ tglx: Handle merge logistics ] Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-12-jirislaby@kernel.org --- include/linux/irqdomain.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index df7e9278c8ac..91ed86feac07 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -358,11 +358,6 @@ int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity); -static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) -{ - return node ? &node->fwnode : NULL; -} - extern const struct fwnode_operations irqchip_fwnode_ops; static inline bool is_fwnode_irqchip(const struct fwnode_handle *fwnode) @@ -387,7 +382,7 @@ struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, static inline struct irq_domain *irq_find_matching_host(struct device_node *node, enum irq_domain_bus_token bus_token) { - return irq_find_matching_fwnode(of_node_to_fwnode(node), bus_token); + return irq_find_matching_fwnode(of_fwnode_handle(node), bus_token); } static inline struct irq_domain *irq_find_host(struct device_node *node) @@ -407,7 +402,7 @@ static inline struct irq_domain *irq_domain_add_simple(struct device_node *of_no const struct irq_domain_ops *ops, void *host_data) { - return irq_domain_create_simple(of_node_to_fwnode(of_node), size, first_irq, ops, host_data); + return irq_domain_create_simple(of_fwnode_handle(of_node), size, first_irq, ops, host_data); } /** @@ -423,7 +418,7 @@ static inline struct irq_domain *irq_domain_add_linear(struct device_node *of_no void *host_data) { struct irq_domain_info info = { - .fwnode = of_node_to_fwnode(of_node), + .fwnode = of_fwnode_handle(of_node), .size = size, .hwirq_max = size, .ops = ops, @@ -442,7 +437,7 @@ static inline struct irq_domain *irq_domain_add_nomap(struct device_node *of_nod void *host_data) { struct irq_domain_info info = { - .fwnode = of_node_to_fwnode(of_node), + .fwnode = of_fwnode_handle(of_node), .hwirq_max = max_irq, .direct_max = max_irq, .ops = ops, @@ -462,7 +457,7 @@ static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node void *host_data) { struct irq_domain_info info = { - .fwnode = of_node_to_fwnode(of_node), + .fwnode = of_fwnode_handle(of_node), .hwirq_max = ~0U, .ops = ops, .host_data = host_data, @@ -611,7 +606,7 @@ static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *par void *host_data) { return irq_domain_create_hierarchy(parent, flags, size, - of_node_to_fwnode(node), + of_fwnode_handle(node), ops, host_data); } @@ -755,6 +750,12 @@ static inline void msi_device_domain_free_wired(struct irq_domain *domain, unsig } #endif +/* Deprecated functions. Will be removed in the merge window */ +static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) +{ + return node ? &node->fwnode : NULL; +} + #else /* CONFIG_IRQ_DOMAIN */ static inline void irq_dispose_mapping(unsigned int virq) { } static inline struct irq_domain *irq_find_matching_fwnode( -- cgit v1.2.3 From c7131b12080ad9c74eadd4f62386c8642168bec7 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:29:05 +0100 Subject: irqdomain: Make irq_domain_create_hierarchy() an inline There is no reason to export the function as an extra symbol. It is simple enough and is just a wrapper to already exported functions. Therefore, switch the exported function to an inline. Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-13-jirislaby@kernel.org --- include/linux/irqdomain.h | 45 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 91ed86feac07..6e9a5ec2986f 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -591,12 +591,45 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, void *handler_data, const char *handler_name); void irq_domain_reset_irq_data(struct irq_data *irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY -struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, - unsigned int flags, - unsigned int size, - struct fwnode_handle *fwnode, - const struct irq_domain_ops *ops, - void *host_data); +/** + * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy + * @parent: Parent irq domain to associate with the new domain + * @flags: Irq domain flags associated to the domain + * @size: Size of the domain. See below + * @fwnode: Optional fwnode of the interrupt controller + * @ops: Pointer to the interrupt domain callbacks + * @host_data: Controller private data pointer + * + * If @size is 0 a tree domain is created, otherwise a linear domain. + * + * If successful the parent is associated to the new domain and the + * domain flags are set. + * Returns pointer to IRQ domain, or NULL on failure. + */ +static inline struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, + unsigned int flags, + unsigned int size, + struct fwnode_handle *fwnode, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain_info info = { + .fwnode = fwnode, + .size = size, + .hwirq_max = size, + .ops = ops, + .host_data = host_data, + .domain_flags = flags, + .parent = parent, + }; + struct irq_domain *d; + + if (!info.size) + info.hwirq_max = ~0U; + + d = irq_domain_instantiate(&info); + return IS_ERR(d) ? NULL : d; +} static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, unsigned int flags, -- cgit v1.2.3 From 9cf19f061ccc98b63367b16551c290016b7d6b13 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:29:10 +0100 Subject: gpio: Switch to irq_domain_create_*() irq_domain_add_*() interfaces are going away as being obsolete now. Switch to the preferred irq_domain_create_*() ones. Those differ in the node parameter: They take more generic struct fwnode_handle instead of struct device_node. Therefore, of_fwnode_handle() is added around the original parameter. Note some of the users can likely use dev->fwnode directly instead of indirect of_fwnode_handle(dev->of_node). But dev->fwnode is not guaranteed to be set for all, so this has to be investigated on case to case basis (by people who can actually test with the HW). [ tglx: Fix up subject prefix ] Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-18-jirislaby@kernel.org --- include/linux/gpio/driver.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 4c0294a9104d..b53233051bee 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -287,8 +287,9 @@ struct gpio_irq_chip { /** * @first: * - * Required for static IRQ allocation. If set, irq_domain_add_simple() - * will allocate and map all IRQs during initialization. + * Required for static IRQ allocation. If set, + * irq_domain_create_simple() will allocate and map all IRQs + * during initialization. */ unsigned int first; -- cgit v1.2.3 From 813da4f379e789c428d2e0a44730f852e090d47d Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:29:32 +0100 Subject: powerpc: Switch irq_domain_add_nomap() to use fwnode All irq_domain_add_*() functions are going away. PowerPC is the only user of irq_domain_add_nomap() and there is no irq_domain_create_nomap() complement. Therefore, to align with the rest of the kernel, rename irq_domain_add_nomap() to irq_domain_create_nomap() and accept a fwnode_handle instead of a device_node. [ tglx: Fix up subject prefix ] Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-40-jirislaby@kernel.org --- include/linux/irqdomain.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 6e9a5ec2986f..f3c79f908a28 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -431,13 +431,13 @@ static inline struct irq_domain *irq_domain_add_linear(struct device_node *of_no } #ifdef CONFIG_IRQ_DOMAIN_NOMAP -static inline struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, +static inline struct irq_domain *irq_domain_create_nomap(struct fwnode_handle *fwnode, unsigned int max_irq, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain_info info = { - .fwnode = of_fwnode_handle(of_node), + .fwnode = fwnode, .hwirq_max = max_irq, .direct_max = max_irq, .ops = ops, -- cgit v1.2.3 From 42b8b16fe56c206fde7fbae0116769d0addef4b7 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:29:33 +0100 Subject: irqdomain: Drop irq_domain_add_*() functions Most irq_domain_add_*() functions are unused now, so drop them. The remaining ones are moved to the deprecated section and will be removed during the merge window after the patches in various trees have been merged. Note: The Chinese docs are touched but unfinished. I cannot parse those. [ tglx: Remove the leftover in irq-domain.rst and handle merge logistics ] Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-41-jirislaby@kernel.org --- include/linux/irqdomain.h | 102 ++++++++++++++++------------------------------ 1 file changed, 34 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index f3c79f908a28..712c662de981 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -338,12 +338,6 @@ struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, unsigned int first_irq, const struct irq_domain_ops *ops, void *host_data); -struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, - unsigned int size, - unsigned int first_irq, - irq_hw_number_t first_hwirq, - const struct irq_domain_ops *ops, - void *host_data); struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, unsigned int size, unsigned int first_irq, @@ -396,40 +390,6 @@ static inline struct irq_domain *irq_find_host(struct device_node *node) return d; } -static inline struct irq_domain *irq_domain_add_simple(struct device_node *of_node, - unsigned int size, - unsigned int first_irq, - const struct irq_domain_ops *ops, - void *host_data) -{ - return irq_domain_create_simple(of_fwnode_handle(of_node), size, first_irq, ops, host_data); -} - -/** - * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. - * @of_node: pointer to interrupt controller's device tree node. - * @size: Number of interrupts in the domain. - * @ops: map/unmap domain callbacks - * @host_data: Controller private data pointer - */ -static inline struct irq_domain *irq_domain_add_linear(struct device_node *of_node, - unsigned int size, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain_info info = { - .fwnode = of_fwnode_handle(of_node), - .size = size, - .hwirq_max = size, - .ops = ops, - .host_data = host_data, - }; - struct irq_domain *d; - - d = irq_domain_instantiate(&info); - return IS_ERR(d) ? NULL : d; -} - #ifdef CONFIG_IRQ_DOMAIN_NOMAP static inline struct irq_domain *irq_domain_create_nomap(struct fwnode_handle *fwnode, unsigned int max_irq, @@ -452,22 +412,6 @@ static inline struct irq_domain *irq_domain_create_nomap(struct fwnode_handle *f unsigned int irq_create_direct_mapping(struct irq_domain *domain); #endif -static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain_info info = { - .fwnode = of_fwnode_handle(of_node), - .hwirq_max = ~0U, - .ops = ops, - .host_data = host_data, - }; - struct irq_domain *d; - - d = irq_domain_instantiate(&info); - return IS_ERR(d) ? NULL : d; -} - static inline struct irq_domain *irq_domain_create_linear(struct fwnode_handle *fwnode, unsigned int size, const struct irq_domain_ops *ops, @@ -631,18 +575,6 @@ static inline struct irq_domain *irq_domain_create_hierarchy(struct irq_domain * return IS_ERR(d) ? NULL : d; } -static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, - unsigned int flags, - unsigned int size, - struct device_node *node, - const struct irq_domain_ops *ops, - void *host_data) -{ - return irq_domain_create_hierarchy(parent, flags, size, - of_fwnode_handle(node), - ops, host_data); -} - int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, bool realloc, @@ -789,6 +721,40 @@ static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) return node ? &node->fwnode : NULL; } +static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain_info info = { + .fwnode = of_fwnode_handle(of_node), + .hwirq_max = ~0U, + .ops = ops, + .host_data = host_data, + }; + struct irq_domain *d; + + d = irq_domain_instantiate(&info); + return IS_ERR(d) ? NULL : d; +} + +static inline struct irq_domain *irq_domain_add_linear(struct device_node *of_node, + unsigned int size, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain_info info = { + .fwnode = of_fwnode_handle(of_node), + .size = size, + .hwirq_max = size, + .ops = ops, + .host_data = host_data, + }; + struct irq_domain *d; + + d = irq_domain_instantiate(&info); + return IS_ERR(d) ? NULL : d; +} + #else /* CONFIG_IRQ_DOMAIN */ static inline void irq_dispose_mapping(unsigned int virq) { } static inline struct irq_domain *irq_find_matching_fwnode( -- cgit v1.2.3 From 14ebb11ba895c9223d9a453a17df2fd81410c96c Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:29:42 +0100 Subject: irqdomain: Drop irq_linear_revmap() irq_linear_revmap() is deprecated and unused now. So remove it. Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-50-jirislaby@kernel.org --- include/linux/irqdomain.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 712c662de981..c8e55cd1e352 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -492,12 +492,6 @@ static inline unsigned int irq_find_mapping(struct irq_domain *domain, return 0; } -static inline unsigned int irq_linear_revmap(struct irq_domain *domain, - irq_hw_number_t hwirq) -{ - return irq_find_mapping(domain, hwirq); -} - extern const struct irq_domain_ops irq_domain_simple_ops; /* stock xlate functions */ -- cgit v1.2.3 From 18e743e911020eb74cc4eaf549c18ed12a5acb9a Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:29:43 +0100 Subject: irqdomain: Use irq_domain_instantiate()'s return value as initializers This makes the code more compact. Note that irq_domain_create_hierarchy()'s handling of size is now part of info's initializer (using the ternary operator). That makes more sense while reading it. Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-51-jirislaby@kernel.org --- include/linux/irqdomain.h | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index c8e55cd1e352..1e3858438e69 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -403,9 +403,8 @@ static inline struct irq_domain *irq_domain_create_nomap(struct fwnode_handle *f .ops = ops, .host_data = host_data, }; - struct irq_domain *d; + struct irq_domain *d = irq_domain_instantiate(&info); - d = irq_domain_instantiate(&info); return IS_ERR(d) ? NULL : d; } @@ -424,9 +423,8 @@ static inline struct irq_domain *irq_domain_create_linear(struct fwnode_handle * .ops = ops, .host_data = host_data, }; - struct irq_domain *d; + struct irq_domain *d = irq_domain_instantiate(&info); - d = irq_domain_instantiate(&info); return IS_ERR(d) ? NULL : d; } @@ -440,9 +438,8 @@ static inline struct irq_domain *irq_domain_create_tree(struct fwnode_handle *fw .ops = ops, .host_data = host_data, }; - struct irq_domain *d; + struct irq_domain *d = irq_domain_instantiate(&info); - d = irq_domain_instantiate(&info); return IS_ERR(d) ? NULL : d; } @@ -554,18 +551,14 @@ static inline struct irq_domain *irq_domain_create_hierarchy(struct irq_domain * struct irq_domain_info info = { .fwnode = fwnode, .size = size, - .hwirq_max = size, + .hwirq_max = size ? : ~0U, .ops = ops, .host_data = host_data, .domain_flags = flags, .parent = parent, }; - struct irq_domain *d; + struct irq_domain *d = irq_domain_instantiate(&info); - if (!info.size) - info.hwirq_max = ~0U; - - d = irq_domain_instantiate(&info); return IS_ERR(d) ? NULL : d; } -- cgit v1.2.3 From 66cbf17fe67156608421e717975c415a85c08466 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:29:44 +0100 Subject: irqdomain: Make struct irq_domain_info variables const This is just expressing it explicitly as irq_domain_instantiate() takes const already. No functional change. Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-52-jirislaby@kernel.org --- include/linux/irqdomain.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 1e3858438e69..66a26dfd254b 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -396,7 +396,7 @@ static inline struct irq_domain *irq_domain_create_nomap(struct fwnode_handle *f const struct irq_domain_ops *ops, void *host_data) { - struct irq_domain_info info = { + const struct irq_domain_info info = { .fwnode = fwnode, .hwirq_max = max_irq, .direct_max = max_irq, @@ -416,7 +416,7 @@ static inline struct irq_domain *irq_domain_create_linear(struct fwnode_handle * const struct irq_domain_ops *ops, void *host_data) { - struct irq_domain_info info = { + const struct irq_domain_info info = { .fwnode = fwnode, .size = size, .hwirq_max = size, @@ -432,7 +432,7 @@ static inline struct irq_domain *irq_domain_create_tree(struct fwnode_handle *fw const struct irq_domain_ops *ops, void *host_data) { - struct irq_domain_info info = { + const struct irq_domain_info info = { .fwnode = fwnode, .hwirq_max = ~0, .ops = ops, @@ -548,7 +548,7 @@ static inline struct irq_domain *irq_domain_create_hierarchy(struct irq_domain * const struct irq_domain_ops *ops, void *host_data) { - struct irq_domain_info info = { + const struct irq_domain_info info = { .fwnode = fwnode, .size = size, .hwirq_max = size ? : ~0U, -- cgit v1.2.3 From 2272a78b3f4a7a1621f00ac6e9c9232d12b7ff01 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:29:45 +0100 Subject: irqdomain: Improve kernel-docs of functions Many of irqdomain.h's functions are referenced in Documentation/ but are not properly documented. Therefore, document these. And use "Returns:" tag consistently, so that it is properly generated in the resulting docs. Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-53-jirislaby@kernel.org --- include/linux/irqdomain.h | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 66a26dfd254b..a70e2ba0b91f 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -411,6 +411,15 @@ static inline struct irq_domain *irq_domain_create_nomap(struct fwnode_handle *f unsigned int irq_create_direct_mapping(struct irq_domain *domain); #endif +/** + * irq_domain_create_linear - Allocate and register a linear revmap irq_domain. + * @fwnode: pointer to interrupt controller's FW node. + * @size: Number of interrupts in the domain. + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + * + * Returns: Newly created irq_domain + */ static inline struct irq_domain *irq_domain_create_linear(struct fwnode_handle *fwnode, unsigned int size, const struct irq_domain_ops *ops, @@ -457,6 +466,18 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain, unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); void irq_dispose_mapping(unsigned int virq); +/** + * irq_create_mapping - Map a hardware interrupt into linux irq space + * @domain: domain owning this hardware interrupt or NULL for default domain + * @hwirq: hardware irq number in that domain space + * + * Only one mapping per hardware interrupt is permitted. + * + * If the sense/trigger is to be specified, set_irq_type() should be called + * on the number returned from that call. + * + * Returns: Linux irq number or 0 on error + */ static inline unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { @@ -467,6 +488,13 @@ struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, irq_hw_number_t hwirq, unsigned int *irq); +/** + * irq_resolve_mapping - Find a linux irq from a hw irq number. + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space + * + * Returns: Interrupt descriptor + */ static inline struct irq_desc *irq_resolve_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { @@ -477,6 +505,8 @@ static inline struct irq_desc *irq_resolve_mapping(struct irq_domain *domain, * irq_find_mapping() - Find a linux irq from a hw irq number. * @domain: domain owning this hardware interrupt * @hwirq: hardware irq number in that domain space + * + * Returns: Linux irq number or 0 if not found */ static inline unsigned int irq_find_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) @@ -539,7 +569,8 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data); * * If successful the parent is associated to the new domain and the * domain flags are set. - * Returns pointer to IRQ domain, or NULL on failure. + * + * Returns: A pointer to IRQ domain, or %NULL on failure. */ static inline struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, unsigned int flags, @@ -570,6 +601,15 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); int irq_domain_activate_irq(struct irq_data *irq_data, bool early); void irq_domain_deactivate_irq(struct irq_data *irq_data); +/** + * irq_domain_alloc_irqs - Allocate IRQs from domain + * @domain: domain to allocate from + * @nr_irqs: number of IRQs to allocate + * @node: NUMA node id for memory allocation + * @arg: domain specific argument + * + * See __irq_domain_alloc_irqs()' documentation. + */ static inline int irq_domain_alloc_irqs(struct irq_domain *domain, unsigned int nr_irqs, int node, void *arg) { -- cgit v1.2.3 From a4efe303e50e3222591ab6ec5eb0ea92b7260d96 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 19 Mar 2025 10:29:49 +0100 Subject: Documentation: irqdomain: Update it The irqdomain documentaion became obsolete over time. Update and extend it a bit with respect to the current code and HW. Most notably the doubled documentation of irq_domain (from .rst and .h) was unified and let only in .rst. A reference link was added to .h. Furthermore: * Add some 'struct' keywords, so that the respective structs are hyperlinked * :c:member: use where appropriate to mark a member of a struct * Rephrase some wording to improve readability/understanding Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250319092951.37667-57-jirislaby@kernel.org --- include/linux/irqdomain.h | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index a70e2ba0b91f..1a1786dc19d2 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -1,30 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * irq_domain - IRQ translation domains + * irq_domain - IRQ Translation Domains * - * Translation infrastructure between hw and linux irq numbers. This is - * helpful for interrupt controllers to implement mapping between hardware - * irq numbers and the Linux irq number space. - * - * irq_domains also have hooks for translating device tree or other - * firmware interrupt representations into a hardware irq number that - * can be mapped back to a Linux irq number without any extra platform - * support code. - * - * Interrupt controller "domain" data structure. This could be defined as a - * irq domain controller. That is, it handles the mapping between hardware - * and virtual interrupt numbers for a given interrupt domain. The domain - * structure is generally created by the PIC code for a given PIC instance - * (though a domain can cover more than one PIC if they have a flat number - * model). It's the domain callbacks that are responsible for setting the - * irq_chip on a given irq_desc after it's been mapped. - * - * The host code and data structures use a fwnode_handle pointer to - * identify the domain. In some cases, and in order to preserve source - * code compatibility, this fwnode pointer is "upgraded" to a DT - * device_node. For those firmware infrastructures that do not provide - * a unique identifier for an interrupt controller, the irq_domain - * code offers a fwnode allocator. + * See Documentation/core-api/irq/irq-domain.rst for the details. */ #ifndef _LINUX_IRQDOMAIN_H -- cgit v1.2.3 From 38c1e73fdeb37962324a3265ef95618dfa4552ab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 6 May 2025 14:22:59 +0200 Subject: irqdomain: Consolidate coding style Now that the file has been thrown through the mincer, finish the job and consolidate the coding style. Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 257 ++++++++++++++++++++-------------------------- 1 file changed, 113 insertions(+), 144 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 1a1786dc19d2..2f6e4c9dd743 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -39,9 +39,9 @@ struct msi_parent_ops; * pass a device-specific description of an interrupt. */ struct irq_fwspec { - struct fwnode_handle *fwnode; - int param_count; - u32 param[IRQ_DOMAIN_IRQ_SPEC_PARAMS]; + struct fwnode_handle *fwnode; + int param_count; + u32 param[IRQ_DOMAIN_IRQ_SPEC_PARAMS]; }; /* Conversion function from of_phandle_args fields to fwspec */ @@ -50,26 +50,26 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, /** * struct irq_domain_ops - Methods for irq_domain objects - * @match: Match an interrupt controller device node to a domain, returns - * 1 on a match - * @select: Match an interrupt controller fw specification. It is more generic - * than @match as it receives a complete struct irq_fwspec. Therefore, - * @select is preferred if provided. Returns 1 on a match. - * @map: Create or update a mapping between a virtual irq number and a hw - * irq number. This is called only once for a given mapping. - * @unmap: Dispose of such a mapping - * @xlate: Given a device tree node and interrupt specifier, decode - * the hardware irq number and linux irq type value. - * @alloc: Allocate @nr_irqs interrupts starting from @virq. - * @free: Free @nr_irqs interrupts starting from @virq. - * @activate: Activate one interrupt in HW (@irqd). If @reserve is set, only - * reserve the vector. If unset, assign the vector (called from - * request_irq()). - * @deactivate: Disarm one interrupt (@irqd). - * @translate: Given @fwspec, decode the hardware irq number (@out_hwirq) and - * linux irq type value (@out_type). This is a generalised @xlate - * (over struct irq_fwspec) and is preferred if provided. - * @debug_show: For domains to show specific data for an interrupt in debugfs. + * @match: Match an interrupt controller device node to a domain, returns + * 1 on a match + * @select: Match an interrupt controller fw specification. It is more generic + * than @match as it receives a complete struct irq_fwspec. Therefore, + * @select is preferred if provided. Returns 1 on a match. + * @map: Create or update a mapping between a virtual irq number and a hw + * irq number. This is called only once for a given mapping. + * @unmap: Dispose of such a mapping + * @xlate: Given a device tree node and interrupt specifier, decode + * the hardware irq number and linux irq type value. + * @alloc: Allocate @nr_irqs interrupts starting from @virq. + * @free: Free @nr_irqs interrupts starting from @virq. + * @activate: Activate one interrupt in HW (@irqd). If @reserve is set, only + * reserve the vector. If unset, assign the vector (called from + * request_irq()). + * @deactivate: Disarm one interrupt (@irqd). + * @translate: Given @fwspec, decode the hardware irq number (@out_hwirq) and + * linux irq type value (@out_type). This is a generalised @xlate + * (over struct irq_fwspec) and is preferred if provided. + * @debug_show: For domains to show specific data for an interrupt in debugfs. * * Functions below are provided by the driver and called whenever a new mapping * is created or an old mapping is disposed. The driver can then proceed to @@ -77,29 +77,29 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, * to setup the irq_desc when returning from map(). */ struct irq_domain_ops { - int (*match)(struct irq_domain *d, struct device_node *node, - enum irq_domain_bus_token bus_token); - int (*select)(struct irq_domain *d, struct irq_fwspec *fwspec, - enum irq_domain_bus_token bus_token); - int (*map)(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw); - void (*unmap)(struct irq_domain *d, unsigned int virq); - int (*xlate)(struct irq_domain *d, struct device_node *node, - const u32 *intspec, unsigned int intsize, - unsigned long *out_hwirq, unsigned int *out_type); + int (*match)(struct irq_domain *d, struct device_node *node, + enum irq_domain_bus_token bus_token); + int (*select)(struct irq_domain *d, struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token); + int (*map)(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw); + void (*unmap)(struct irq_domain *d, unsigned int virq); + int (*xlate)(struct irq_domain *d, struct device_node *node, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /* extended V2 interfaces to support hierarchy irq_domains */ - int (*alloc)(struct irq_domain *d, unsigned int virq, - unsigned int nr_irqs, void *arg); - void (*free)(struct irq_domain *d, unsigned int virq, - unsigned int nr_irqs); - int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool reserve); - void (*deactivate)(struct irq_domain *d, struct irq_data *irq_data); - int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec, - unsigned long *out_hwirq, unsigned int *out_type); + int (*alloc)(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs, void *arg); + void (*free)(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs); + int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool reserve); + void (*deactivate)(struct irq_domain *d, struct irq_data *irq_data); + int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *out_hwirq, unsigned int *out_type); #endif #ifdef CONFIG_GENERIC_IRQ_DEBUGFS - void (*debug_show)(struct seq_file *m, struct irq_domain *d, - struct irq_data *irqd, int ind); + void (*debug_show)(struct seq_file *m, struct irq_domain *d, + struct irq_data *irqd, int ind); #endif }; @@ -222,8 +222,7 @@ static inline struct device_node *irq_domain_get_of_node(struct irq_domain *d) return to_of_node(d->fwnode); } -static inline void irq_domain_set_pm_device(struct irq_domain *d, - struct device *dev) +static inline void irq_domain_set_pm_device(struct irq_domain *d, struct device *dev) { if (d) d->pm_dev = dev; @@ -239,14 +238,12 @@ enum { IRQCHIP_FWNODE_NAMED_ID, }; -static inline -struct fwnode_handle *irq_domain_alloc_named_fwnode(const char *name) +static inline struct fwnode_handle *irq_domain_alloc_named_fwnode(const char *name) { return __irq_domain_alloc_fwnode(IRQCHIP_FWNODE_NAMED, 0, name, NULL); } -static inline -struct fwnode_handle *irq_domain_alloc_named_id_fwnode(const char *name, int id) +static inline struct fwnode_handle *irq_domain_alloc_named_id_fwnode(const char *name, int id) { return __irq_domain_alloc_fwnode(IRQCHIP_FWNODE_NAMED_ID, id, name, NULL); @@ -311,23 +308,17 @@ struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info); struct irq_domain *devm_irq_domain_instantiate(struct device *dev, const struct irq_domain_info *info); -struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, - unsigned int size, +struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, unsigned int size, unsigned int first_irq, - const struct irq_domain_ops *ops, - void *host_data); -struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, - unsigned int size, - unsigned int first_irq, - irq_hw_number_t first_hwirq, - const struct irq_domain_ops *ops, - void *host_data); + const struct irq_domain_ops *ops, void *host_data); +struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, unsigned int size, + unsigned int first_irq, irq_hw_number_t first_hwirq, + const struct irq_domain_ops *ops, void *host_data); struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token); void irq_set_default_domain(struct irq_domain *domain); struct irq_domain *irq_get_default_domain(void); -int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, - irq_hw_number_t hwirq, int node, +int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity); extern const struct fwnode_operations irqchip_fwnode_ops; @@ -337,12 +328,10 @@ static inline bool is_fwnode_irqchip(const struct fwnode_handle *fwnode) return fwnode && fwnode->ops == &irqchip_fwnode_ops; } -void irq_domain_update_bus_token(struct irq_domain *domain, - enum irq_domain_bus_token bus_token); +void irq_domain_update_bus_token(struct irq_domain *domain, enum irq_domain_bus_token bus_token); -static inline -struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, - enum irq_domain_bus_token bus_token) +static inline struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, + enum irq_domain_bus_token bus_token) { struct irq_fwspec fwspec = { .fwnode = fwnode, @@ -370,9 +359,9 @@ static inline struct irq_domain *irq_find_host(struct device_node *node) #ifdef CONFIG_IRQ_DOMAIN_NOMAP static inline struct irq_domain *irq_domain_create_nomap(struct fwnode_handle *fwnode, - unsigned int max_irq, - const struct irq_domain_ops *ops, - void *host_data) + unsigned int max_irq, + const struct irq_domain_ops *ops, + void *host_data) { const struct irq_domain_info info = { .fwnode = fwnode, @@ -391,17 +380,17 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain); /** * irq_domain_create_linear - Allocate and register a linear revmap irq_domain. - * @fwnode: pointer to interrupt controller's FW node. - * @size: Number of interrupts in the domain. - * @ops: map/unmap domain callbacks - * @host_data: Controller private data pointer + * @fwnode: pointer to interrupt controller's FW node. + * @size: Number of interrupts in the domain. + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer * * Returns: Newly created irq_domain */ static inline struct irq_domain *irq_domain_create_linear(struct fwnode_handle *fwnode, - unsigned int size, - const struct irq_domain_ops *ops, - void *host_data) + unsigned int size, + const struct irq_domain_ops *ops, + void *host_data) { const struct irq_domain_info info = { .fwnode = fwnode, @@ -416,8 +405,8 @@ static inline struct irq_domain *irq_domain_create_linear(struct fwnode_handle * } static inline struct irq_domain *irq_domain_create_tree(struct fwnode_handle *fwnode, - const struct irq_domain_ops *ops, - void *host_data) + const struct irq_domain_ops *ops, + void *host_data) { const struct irq_domain_info info = { .fwnode = fwnode, @@ -432,22 +421,19 @@ static inline struct irq_domain *irq_domain_create_tree(struct fwnode_handle *fw void irq_domain_remove(struct irq_domain *domain); -int irq_domain_associate(struct irq_domain *domain, unsigned int irq, - irq_hw_number_t hwirq); -void irq_domain_associate_many(struct irq_domain *domain, - unsigned int irq_base, +int irq_domain_associate(struct irq_domain *domain, unsigned int irq, irq_hw_number_t hwirq); +void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, irq_hw_number_t hwirq_base, int count); -unsigned int irq_create_mapping_affinity(struct irq_domain *domain, - irq_hw_number_t hwirq, +unsigned int irq_create_mapping_affinity(struct irq_domain *domain, irq_hw_number_t hwirq, const struct irq_affinity_desc *affinity); unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); void irq_dispose_mapping(unsigned int virq); /** * irq_create_mapping - Map a hardware interrupt into linux irq space - * @domain: domain owning this hardware interrupt or NULL for default domain - * @hwirq: hardware irq number in that domain space + * @domain: domain owning this hardware interrupt or NULL for default domain + * @hwirq: hardware irq number in that domain space * * Only one mapping per hardware interrupt is permitted. * @@ -456,8 +442,7 @@ void irq_dispose_mapping(unsigned int virq); * * Returns: Linux irq number or 0 on error */ -static inline unsigned int irq_create_mapping(struct irq_domain *domain, - irq_hw_number_t hwirq) +static inline unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { return irq_create_mapping_affinity(domain, hwirq, NULL); } @@ -468,8 +453,8 @@ struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, /** * irq_resolve_mapping - Find a linux irq from a hw irq number. - * @domain: domain owning this hardware interrupt - * @hwirq: hardware irq number in that domain space + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space * * Returns: Interrupt descriptor */ @@ -481,8 +466,8 @@ static inline struct irq_desc *irq_resolve_mapping(struct irq_domain *domain, /** * irq_find_mapping() - Find a linux irq from a hw irq number. - * @domain: domain owning this hardware interrupt - * @hwirq: hardware irq number in that domain space + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space * * Returns: Linux irq number or 0 if not found */ @@ -501,14 +486,14 @@ extern const struct irq_domain_ops irq_domain_simple_ops; /* stock xlate functions */ int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, - const u32 *intspec, unsigned int intsize, - irq_hw_number_t *out_hwirq, unsigned int *out_type); + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type); int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, - const u32 *intspec, unsigned int intsize, - irq_hw_number_t *out_hwirq, unsigned int *out_type); + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type); int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr, - const u32 *intspec, unsigned int intsize, - irq_hw_number_t *out_hwirq, unsigned int *out_type); + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type); int irq_domain_xlate_twothreecell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type); @@ -525,12 +510,9 @@ int irq_reserve_ipi(struct irq_domain *domain, const struct cpumask *dest); int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest); /* V2 interfaces to support hierarchy IRQ domains. */ -struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, - unsigned int virq); -void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, - const struct irq_chip *chip, - void *chip_data, irq_flow_handler_t handler, +struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, unsigned int virq); +void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq, + const struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name); void irq_domain_reset_irq_data(struct irq_data *irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY @@ -551,11 +533,10 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data); * Returns: A pointer to IRQ domain, or %NULL on failure. */ static inline struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, - unsigned int flags, - unsigned int size, - struct fwnode_handle *fwnode, - const struct irq_domain_ops *ops, - void *host_data) + unsigned int flags, unsigned int size, + struct fwnode_handle *fwnode, + const struct irq_domain_ops *ops, + void *host_data) { const struct irq_domain_info info = { .fwnode = fwnode, @@ -571,9 +552,8 @@ static inline struct irq_domain *irq_domain_create_hierarchy(struct irq_domain * return IS_ERR(d) ? NULL : d; } -int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, - unsigned int nr_irqs, int node, void *arg, - bool realloc, +int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, + int node, void *arg, bool realloc, const struct irq_affinity_desc *affinity); void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); int irq_domain_activate_irq(struct irq_data *irq_data, bool early); @@ -588,37 +568,29 @@ void irq_domain_deactivate_irq(struct irq_data *irq_data); * * See __irq_domain_alloc_irqs()' documentation. */ -static inline int irq_domain_alloc_irqs(struct irq_domain *domain, - unsigned int nr_irqs, int node, void *arg) +static inline int irq_domain_alloc_irqs(struct irq_domain *domain, unsigned int nr_irqs, + int node, void *arg) { - return __irq_domain_alloc_irqs(domain, -1, nr_irqs, node, arg, false, - NULL); + return __irq_domain_alloc_irqs(domain, -1, nr_irqs, node, arg, false, NULL); } -int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, - unsigned int virq, - irq_hw_number_t hwirq, - const struct irq_chip *chip, +int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data); -void irq_domain_free_irqs_common(struct irq_domain *domain, - unsigned int virq, +void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs); -void irq_domain_free_irqs_top(struct irq_domain *domain, - unsigned int virq, unsigned int nr_irqs); +void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs); int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg); int irq_domain_pop_irq(struct irq_domain *domain, int virq); -int irq_domain_alloc_irqs_parent(struct irq_domain *domain, - unsigned int irq_base, +int irq_domain_alloc_irqs_parent(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs, void *arg); -void irq_domain_free_irqs_parent(struct irq_domain *domain, - unsigned int irq_base, +void irq_domain_free_irqs_parent(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs); -int irq_domain_disconnect_hierarchy(struct irq_domain *domain, - unsigned int virq); +int irq_domain_disconnect_hierarchy(struct irq_domain *domain, unsigned int virq); static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) { @@ -627,8 +599,7 @@ static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) static inline bool irq_domain_is_ipi(struct irq_domain *domain) { - return domain->flags & - (IRQ_DOMAIN_FLAG_IPI_PER_CPU | IRQ_DOMAIN_FLAG_IPI_SINGLE); + return domain->flags & (IRQ_DOMAIN_FLAG_IPI_PER_CPU | IRQ_DOMAIN_FLAG_IPI_SINGLE); } static inline bool irq_domain_is_ipi_per_cpu(struct irq_domain *domain) @@ -657,14 +628,13 @@ static inline bool irq_domain_is_msi_device(struct irq_domain *domain) } #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ -static inline int irq_domain_alloc_irqs(struct irq_domain *domain, - unsigned int nr_irqs, int node, void *arg) +static inline int irq_domain_alloc_irqs(struct irq_domain *domain, unsigned int nr_irqs, + int node, void *arg) { return -1; } -static inline void irq_domain_free_irqs(unsigned int virq, - unsigned int nr_irqs) { } +static inline void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) { } static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) { @@ -704,8 +674,7 @@ static inline bool irq_domain_is_msi_device(struct irq_domain *domain) #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #ifdef CONFIG_GENERIC_MSI_IRQ -int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq, - unsigned int type); +int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq, unsigned int type); void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq); #else static inline int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq, @@ -727,8 +696,8 @@ static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) } static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, - const struct irq_domain_ops *ops, - void *host_data) + const struct irq_domain_ops *ops, + void *host_data) { struct irq_domain_info info = { .fwnode = of_fwnode_handle(of_node), @@ -743,9 +712,9 @@ static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node } static inline struct irq_domain *irq_domain_add_linear(struct device_node *of_node, - unsigned int size, - const struct irq_domain_ops *ops, - void *host_data) + unsigned int size, + const struct irq_domain_ops *ops, + void *host_data) { struct irq_domain_info info = { .fwnode = of_fwnode_handle(of_node), @@ -762,8 +731,8 @@ static inline struct irq_domain *irq_domain_add_linear(struct device_node *of_no #else /* CONFIG_IRQ_DOMAIN */ static inline void irq_dispose_mapping(unsigned int virq) { } -static inline struct irq_domain *irq_find_matching_fwnode( - struct fwnode_handle *fwnode, enum irq_domain_bus_token bus_token) +static inline struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, + enum irq_domain_bus_token bus_token) { return NULL; } -- cgit v1.2.3 From e51b27438a10391fdc94dd2046d9ffa9c2679c74 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 May 2025 18:28:11 +0100 Subject: irqchip: Make irq-msi-lib.h globally available Move irq-msi-lib.h into include/linux/irqchip, making it available to compilation units outside of drivers/irqchip. This requires some churn in drivers to fetch it from the new location, generated using this script: git grep -l -w \"irq-msi-lib.h\" | \ xargs sed -i -e 's:"irq-msi-lib.h":\:' Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250513172819.2216709-2-maz@kernel.org --- include/linux/irqchip/irq-msi-lib.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 include/linux/irqchip/irq-msi-lib.h (limited to 'include/linux') diff --git a/include/linux/irqchip/irq-msi-lib.h b/include/linux/irqchip/irq-msi-lib.h new file mode 100644 index 000000000000..dd8d1d138544 --- /dev/null +++ b/include/linux/irqchip/irq-msi-lib.h @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (C) 2022 Linutronix GmbH +// Copyright (C) 2022 Intel + +#ifndef _IRQCHIP_IRQ_MSI_LIB_H +#define _IRQCHIP_IRQ_MSI_LIB_H + +#include +#include +#include + +#ifdef CONFIG_PCI_MSI +#define MATCH_PCI_MSI BIT(DOMAIN_BUS_PCI_MSI) +#else +#define MATCH_PCI_MSI (0) +#endif + +#define MATCH_PLATFORM_MSI BIT(DOMAIN_BUS_PLATFORM_MSI) + +int msi_lib_irq_domain_select(struct irq_domain *d, struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token); + +bool msi_lib_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, + struct msi_domain_info *info); + +#endif /* _IRQCHIP_IRQ_MSI_LIB_H */ -- cgit v1.2.3 From e4d001b54f78769ba1a1404c2801ae95e19fd893 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 13 May 2025 18:28:12 +0100 Subject: genirq/msi: Add helper for creating MSI-parent irq domains Creating an irq domain that serves as an MSI parent requires a substantial amount of esoteric boiler-plate code, some of which is often provided twice (such as the bus token). To make things a bit simpler for the unsuspecting MSI tinkerer, provide a helper that does it for them, and serves as documentation of what needs to be provided. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250513172819.2216709-3-maz@kernel.org --- include/linux/msi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index f4b94ccaf0c1..6863540f4b71 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -636,6 +636,10 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent); +struct irq_domain_info; +struct irq_domain *msi_create_parent_irq_domain(struct irq_domain_info *info, + const struct msi_parent_ops *msi_parent_ops); + bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, const struct msi_domain_template *template, unsigned int hwsize, void *domain_data, -- cgit v1.2.3 From dcd21b609d4abc7303f8683bce4f35d78d7d6830 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 27 Apr 2025 18:21:06 -0400 Subject: NFS: Avoid flushing data while holding directory locks in nfs_rename() The Linux client assumes that all filehandles are non-volatile for renames within the same directory (otherwise sillyrename cannot work). However, the existence of the Linux 'subtree_check' export option has meant that nfs_rename() has always assumed it needs to flush writes before attempting to rename. Since NFSv4 does allow the client to query whether or not the server exhibits this behaviour, and since knfsd does actually set the appropriate flag when 'subtree_check' is enabled on an export, it should be OK to optimise away the write flushing behaviour in the cases where it is clearly not needed. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton --- include/linux/nfs_fs_sb.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 71319637a84e..ee03f3cef30c 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -213,6 +213,15 @@ struct nfs_server { char *fscache_uniq; /* Uniquifier (or NULL) */ #endif + /* The following #defines numerically match the NFSv4 equivalents */ +#define NFS_FH_NOEXPIRE_WITH_OPEN (0x1) +#define NFS_FH_VOLATILE_ANY (0x2) +#define NFS_FH_VOL_MIGRATION (0x4) +#define NFS_FH_VOL_RENAME (0x8) +#define NFS_FH_RENAME_UNSAFE (NFS_FH_VOLATILE_ANY | NFS_FH_VOL_RENAME) + u32 fh_expire_type; /* V4 bitmask representing file + handle volatility type for + this filesystem */ u32 pnfs_blksize; /* layout_blksize attr */ #if IS_ENABLED(CONFIG_NFS_V4) u32 attr_bitmask[3];/* V4 bitmask representing the set @@ -236,9 +245,6 @@ struct nfs_server { u32 acl_bitmask; /* V4 bitmask representing the ACEs that are supported on this filesystem */ - u32 fh_expire_type; /* V4 bitmask representing file - handle volatility type for - this filesystem */ struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ struct rpc_wait_queue roc_rpcwaitq; void *pnfs_ld_data; /* per mount point data */ -- cgit v1.2.3 From 7b151e4efdde7cc7cfaae66e497d12487a70c6e9 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 14 May 2025 20:54:29 +0200 Subject: net: phy: fixed_phy: remove fixed_phy_register_with_gpiod Since its introduction 6 yrs ago this functions has never had a user. So remove it. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/ccbeef28-65ae-4e28-b1db-816c44338dee@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy_fixed.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 1acafd86ab13..3392c09b5d24 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -13,7 +13,6 @@ struct fixed_phy_status { }; struct device_node; -struct gpio_desc; struct net_device; #if IS_ENABLED(CONFIG_FIXED_PHY) @@ -24,11 +23,6 @@ extern struct phy_device *fixed_phy_register(unsigned int irq, struct fixed_phy_status *status, struct device_node *np); -extern struct phy_device * -fixed_phy_register_with_gpiod(unsigned int irq, - struct fixed_phy_status *status, - struct gpio_desc *gpiod); - extern void fixed_phy_unregister(struct phy_device *phydev); extern int fixed_phy_set_link_update(struct phy_device *phydev, int (*link_update)(struct net_device *, @@ -46,14 +40,6 @@ static inline struct phy_device *fixed_phy_register(unsigned int irq, return ERR_PTR(-ENODEV); } -static inline struct phy_device * -fixed_phy_register_with_gpiod(unsigned int irq, - struct fixed_phy_status *status, - struct gpio_desc *gpiod) -{ - return ERR_PTR(-ENODEV); -} - static inline void fixed_phy_unregister(struct phy_device *phydev) { } -- cgit v1.2.3 From c46286fdd6aa1d0e33c245bcffe9ff2428a777bd Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 15 May 2025 18:49:26 +0200 Subject: mr: consolidate the ipmr_can_free_table() checks. Guoyu Yin reported a splat in the ipmr netns cleanup path: WARNING: CPU: 2 PID: 14564 at net/ipv4/ipmr.c:440 ipmr_free_table net/ipv4/ipmr.c:440 [inline] WARNING: CPU: 2 PID: 14564 at net/ipv4/ipmr.c:440 ipmr_rules_exit+0x135/0x1c0 net/ipv4/ipmr.c:361 Modules linked in: CPU: 2 UID: 0 PID: 14564 Comm: syz.4.838 Not tainted 6.14.0 #1 Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:ipmr_free_table net/ipv4/ipmr.c:440 [inline] RIP: 0010:ipmr_rules_exit+0x135/0x1c0 net/ipv4/ipmr.c:361 Code: ff df 48 c1 ea 03 80 3c 02 00 75 7d 48 c7 83 60 05 00 00 00 00 00 00 5b 5d 41 5c 41 5d 41 5e e9 71 67 7f 00 e8 4c 2d 8a fd 90 <0f> 0b 90 eb 93 e8 41 2d 8a fd 0f b6 2d 80 54 ea 01 31 ff 89 ee e8 RSP: 0018:ffff888109547c58 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff888108c12dc0 RCX: ffffffff83e09868 RDX: ffff8881022b3300 RSI: ffffffff83e098d4 RDI: 0000000000000005 RBP: ffff888104288000 R08: 0000000000000000 R09: ffffed10211825c9 R10: 0000000000000001 R11: ffff88801816c4a0 R12: 0000000000000001 R13: ffff888108c13320 R14: ffff888108c12dc0 R15: fffffbfff0b74058 FS: 00007f84f39316c0(0000) GS:ffff88811b100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f84f3930f98 CR3: 0000000113b56000 CR4: 0000000000350ef0 Call Trace: ipmr_net_exit_batch+0x50/0x90 net/ipv4/ipmr.c:3160 ops_exit_list+0x10c/0x160 net/core/net_namespace.c:177 setup_net+0x47d/0x8e0 net/core/net_namespace.c:394 copy_net_ns+0x25d/0x410 net/core/net_namespace.c:516 create_new_namespaces+0x3f6/0xaf0 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0xc3/0x180 kernel/nsproxy.c:228 ksys_unshare+0x78d/0x9a0 kernel/fork.c:3342 __do_sys_unshare kernel/fork.c:3413 [inline] __se_sys_unshare kernel/fork.c:3411 [inline] __x64_sys_unshare+0x31/0x40 kernel/fork.c:3411 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xa6/0x1a0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f84f532cc29 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f84f3931038 EFLAGS: 00000246 ORIG_RAX: 0000000000000110 RAX: ffffffffffffffda RBX: 00007f84f5615fa0 RCX: 00007f84f532cc29 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000040000400 RBP: 00007f84f53fba18 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007f84f5615fa0 R15: 00007fff51c5f328 The running kernel has CONFIG_IP_MROUTE_MULTIPLE_TABLES disabled, and the sanity check for such build is still too loose. Address the issue consolidating the relevant sanity check in a single helper regardless of the kernel configuration. Also share it between the ipv4 and ipv6 code. Reported-by: Guoyu Yin Fixes: 50b94204446e ("ipmr: tune the ipmr_can_free_table() checks.") Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/372dc261e1bf12742276e1b984fc5a071b7fc5a8.1747321903.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 58a2401e4b55..0075f6e5c3da 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -262,6 +262,11 @@ struct mr_table { int mroute_reg_vif_num; }; +static inline bool mr_can_free_table(struct net *net) +{ + return !check_net(net) || !net_initialized(net); +} + #ifdef CONFIG_IP_MROUTE_COMMON void vif_device_init(struct vif_device *v, struct net_device *dev, -- cgit v1.2.3 From c80bdb1c55719cd6308d648a7920272a3be09e34 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 May 2025 13:16:44 -0600 Subject: io_uring: pass in struct io_big_cqe to io_alloc_ocqe() Rather than pass extra1/extra2 separately, just pass in the (now) named io_big_cqe struct instead. The callers that don't use/support CQE32 will now just pass a single NULL, rather than two seperate mystery zero values. Move the clearing of the big_cqe elements into io_alloc_ocqe() as well, so it can get moved out of the generic code. Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 00dbd7cd0e7d..2922635986f5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -710,7 +710,7 @@ struct io_kiocb { const struct cred *creds; struct io_wq_work work; - struct { + struct io_big_cqe { u64 extra1; u64 extra2; } big_cqe; -- cgit v1.2.3 From c451e2da54bce183fbb270ec01ab3ca725ddf943 Mon Sep 17 00:00:00 2001 From: Sumanth Gavini Date: Sun, 18 May 2025 01:57:32 -0700 Subject: regulator: max8952: Correct Samsung "Electronics" spelling in copyright headers Fix the misspelling of 'Electronics' in max8952 driver copyright headers. Signed-off-by: Sumanth Gavini Link: https://patch.msgid.link/20250518085734.88890-7-sumanth.gavini@yahoo.com Signed-off-by: Mark Brown --- include/linux/regulator/max8952.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/regulator/max8952.h b/include/linux/regulator/max8952.h index 8712c091abf0..61dcd8e00a2f 100644 --- a/include/linux/regulator/max8952.h +++ b/include/linux/regulator/max8952.h @@ -2,7 +2,7 @@ /* * max8952.h - Voltage regulation for the Maxim 8952 * - * Copyright (C) 2010 Samsung Electrnoics + * Copyright (C) 2010 Samsung Electronics * MyungJoo Ham */ -- cgit v1.2.3 From ec23a899d96f9ee3389abe6c516d09cae2fde5e3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 16 May 2025 15:32:24 +0200 Subject: spi: sh-msiof: Move register definitions to Move the MSIOF register and register bit definitions from the MSIOF SPI driver to the existing header file , so they can be shared with the MSIOF I2S driver. Signed-off-by: Geert Uytterhoeven Link: https://patch.msgid.link/066d1086973eb309006258484e9fe8138807e565.1747401908.git.geert+renesas@glider.be Signed-off-by: Mark Brown --- include/linux/spi/sh_msiof.h | 125 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/sh_msiof.h b/include/linux/spi/sh_msiof.h index f950d280461b..9fbef3fd4056 100644 --- a/include/linux/spi/sh_msiof.h +++ b/include/linux/spi/sh_msiof.h @@ -2,6 +2,131 @@ #ifndef __SPI_SH_MSIOF_H__ #define __SPI_SH_MSIOF_H__ +#include +#include + +#define SITMDR1 0x00 /* Transmit Mode Register 1 */ +#define SITMDR2 0x04 /* Transmit Mode Register 2 */ +#define SITMDR3 0x08 /* Transmit Mode Register 3 */ +#define SIRMDR1 0x10 /* Receive Mode Register 1 */ +#define SIRMDR2 0x14 /* Receive Mode Register 2 */ +#define SIRMDR3 0x18 /* Receive Mode Register 3 */ +#define SITSCR 0x20 /* Transmit Clock Select Register */ +#define SIRSCR 0x22 /* Receive Clock Select Register (SH, A1, APE6) */ +#define SICTR 0x28 /* Control Register */ +#define SIFCTR 0x30 /* FIFO Control Register */ +#define SISTR 0x40 /* Status Register */ +#define SIIER 0x44 /* Interrupt Enable Register */ +#define SITDR1 0x48 /* Transmit Control Data Register 1 (SH, A1) */ +#define SITDR2 0x4c /* Transmit Control Data Register 2 (SH, A1) */ +#define SITFDR 0x50 /* Transmit FIFO Data Register */ +#define SIRDR1 0x58 /* Receive Control Data Register 1 (SH, A1) */ +#define SIRDR2 0x5c /* Receive Control Data Register 2 (SH, A1) */ +#define SIRFDR 0x60 /* Receive FIFO Data Register */ + +/* SITMDR1 and SIRMDR1 */ +#define SIMDR1_TRMD BIT(31) /* Transfer Mode (1 = Master mode) */ +#define SIMDR1_SYNCMD GENMASK(29, 28) /* SYNC Mode */ +#define SIMDR1_SYNCMD_PULSE 0U /* Frame start sync pulse */ +#define SIMDR1_SYNCMD_SPI 2U /* Level mode/SPI */ +#define SIMDR1_SYNCMD_LR 3U /* L/R mode */ +#define SIMDR1_SYNCAC BIT(25) /* Sync Polarity (1 = Active-low) */ +#define SIMDR1_BITLSB BIT(24) /* MSB/LSB First (1 = LSB first) */ +#define SIMDR1_DTDL GENMASK(22, 20) /* Data Pin Bit Delay for MSIOF_SYNC */ +#define SIMDR1_SYNCDL GENMASK(18, 16) /* Frame Sync Signal Timing Delay */ +#define SIMDR1_FLD GENMASK(3, 2) /* Frame Sync Signal Interval (0-3) */ +#define SIMDR1_XXSTP BIT(0) /* Transmission/Reception Stop on FIFO */ +/* SITMDR1 */ +#define SITMDR1_PCON BIT(30) /* Transfer Signal Connection */ +#define SITMDR1_SYNCCH GENMASK(27, 26) /* Sync Signal Channel Select */ + /* 0=MSIOF_SYNC, 1=MSIOF_SS1, 2=MSIOF_SS2 */ + +/* SITMDR2 and SIRMDR2 */ +#define SIMDR2_GRP GENMASK(31, 30) /* Group Count */ +#define SIMDR2_BITLEN1 GENMASK(28, 24) /* Data Size (8-32 bits) */ +#define SIMDR2_WDLEN1 GENMASK(23, 16) /* Word Count (1-64/256 (SH, A1))) */ +#define SIMDR2_GRPMASK GENMASK(3, 0) /* Group Output Mask 1-4 (SH, A1) */ + +/* SITMDR3 and SIRMDR3 */ +#define SIMDR3_BITLEN2 GENMASK(28, 24) /* Data Size (8-32 bits) */ +#define SIMDR3_WDLEN2 GENMASK(23, 16) /* Word Count (1-64/256 (SH, A1))) */ + +/* SITSCR and SIRSCR */ +#define SISCR_BRPS GENMASK(12, 8) /* Prescaler Setting (1-32) */ +#define SISCR_BRDV GENMASK(2, 0) /* Baud Rate Generator's Division Ratio */ + +/* SICTR */ +#define SICTR_TSCKIZ GENMASK(31, 30) /* Transmit Clock I/O Polarity Select */ +#define SICTR_TSCKIZ_SCK BIT(31) /* Disable SCK when TX disabled */ +#define SICTR_TSCKIZ_POL BIT(30) /* Transmit Clock Polarity */ +#define SICTR_RSCKIZ GENMASK(29, 28) /* Receive Clock Polarity Select */ +#define SICTR_RSCKIZ_SCK BIT(29) /* Must match CTR_TSCKIZ_SCK */ +#define SICTR_RSCKIZ_POL BIT(28) /* Receive Clock Polarity */ +#define SICTR_TEDG BIT(27) /* Transmit Timing (1 = falling edge) */ +#define SICTR_REDG BIT(26) /* Receive Timing (1 = falling edge) */ +#define SICTR_TXDIZ GENMASK(23, 22) /* Pin Output When TX is Disabled */ +#define SICTR_TXDIZ_LOW 0U /* 0 */ +#define SICTR_TXDIZ_HIGH 1U /* 1 */ +#define SICTR_TXDIZ_HIZ 2U /* High-impedance */ +#define SICTR_TSCKE BIT(15) /* Transmit Serial Clock Output Enable */ +#define SICTR_TFSE BIT(14) /* Transmit Frame Sync Signal Output Enable */ +#define SICTR_TXE BIT(9) /* Transmit Enable */ +#define SICTR_RXE BIT(8) /* Receive Enable */ +#define SICTR_TXRST BIT(1) /* Transmit Reset */ +#define SICTR_RXRST BIT(0) /* Receive Reset */ + +/* SIFCTR */ +#define SIFCTR_TFWM GENMASK(31, 29) /* Transmit FIFO Watermark */ +#define SIFCTR_TFWM_64 0U /* Transfer Request when 64 empty stages */ +#define SIFCTR_TFWM_32 1U /* Transfer Request when 32 empty stages */ +#define SIFCTR_TFWM_24 2U /* Transfer Request when 24 empty stages */ +#define SIFCTR_TFWM_16 3U /* Transfer Request when 16 empty stages */ +#define SIFCTR_TFWM_12 4U /* Transfer Request when 12 empty stages */ +#define SIFCTR_TFWM_8 5U /* Transfer Request when 8 empty stages */ +#define SIFCTR_TFWM_4 6U /* Transfer Request when 4 empty stages */ +#define SIFCTR_TFWM_1 7U /* Transfer Request when 1 empty stage */ +#define SIFCTR_TFUA GENMASK(28, 20) /* Transmit FIFO Usable Area */ +#define SIFCTR_RFWM GENMASK(15, 13) /* Receive FIFO Watermark */ +#define SIFCTR_RFWM_1 0U /* Transfer Request when 1 valid stages */ +#define SIFCTR_RFWM_4 1U /* Transfer Request when 4 valid stages */ +#define SIFCTR_RFWM_8 2U /* Transfer Request when 8 valid stages */ +#define SIFCTR_RFWM_16 3U /* Transfer Request when 16 valid stages */ +#define SIFCTR_RFWM_32 4U /* Transfer Request when 32 valid stages */ +#define SIFCTR_RFWM_64 5U /* Transfer Request when 64 valid stages */ +#define SIFCTR_RFWM_128 6U /* Transfer Request when 128 valid stages */ +#define SIFCTR_RFWM_256 7U /* Transfer Request when 256 valid stages */ +#define SIFCTR_RFUA GENMASK(12, 4) /* Receive FIFO Usable Area (0x40 = full) */ + +/* SISTR */ +#define SISTR_TFEMP BIT(29) /* Transmit FIFO Empty */ +#define SISTR_TDREQ BIT(28) /* Transmit Data Transfer Request */ +#define SISTR_TEOF BIT(23) /* Frame Transmission End */ +#define SISTR_TFSERR BIT(21) /* Transmit Frame Synchronization Error */ +#define SISTR_TFOVF BIT(20) /* Transmit FIFO Overflow */ +#define SISTR_TFUDF BIT(19) /* Transmit FIFO Underflow */ +#define SISTR_RFFUL BIT(13) /* Receive FIFO Full */ +#define SISTR_RDREQ BIT(12) /* Receive Data Transfer Request */ +#define SISTR_REOF BIT(7) /* Frame Reception End */ +#define SISTR_RFSERR BIT(5) /* Receive Frame Synchronization Error */ +#define SISTR_RFUDF BIT(4) /* Receive FIFO Underflow */ +#define SISTR_RFOVF BIT(3) /* Receive FIFO Overflow */ + +/* SIIER */ +#define SIIER_TDMAE BIT(31) /* Transmit Data DMA Transfer Req. Enable */ +#define SIIER_TFEMPE BIT(29) /* Transmit FIFO Empty Enable */ +#define SIIER_TDREQE BIT(28) /* Transmit Data Transfer Request Enable */ +#define SIIER_TEOFE BIT(23) /* Frame Transmission End Enable */ +#define SIIER_TFSERRE BIT(21) /* Transmit Frame Sync Error Enable */ +#define SIIER_TFOVFE BIT(20) /* Transmit FIFO Overflow Enable */ +#define SIIER_TFUDFE BIT(19) /* Transmit FIFO Underflow Enable */ +#define SIIER_RDMAE BIT(15) /* Receive Data DMA Transfer Req. Enable */ +#define SIIER_RFFULE BIT(13) /* Receive FIFO Full Enable */ +#define SIIER_RDREQE BIT(12) /* Receive Data Transfer Request Enable */ +#define SIIER_REOFE BIT(7) /* Frame Reception End Enable */ +#define SIIER_RFSERRE BIT(5) /* Receive Frame Sync Error Enable */ +#define SIIER_RFUDFE BIT(4) /* Receive FIFO Underflow Enable */ +#define SIIER_RFOVFE BIT(3) /* Receive FIFO Overflow Enable */ + enum { MSIOF_SPI_HOST, MSIOF_SPI_TARGET, -- cgit v1.2.3 From 08d6ee6d8a10aef958c2af16bb121070290ed589 Mon Sep 17 00:00:00 2001 From: Nikhil Jha Date: Wed, 19 Mar 2025 13:02:39 -0400 Subject: sunrpc: implement rfc2203 rpcsec_gss seqnum cache This implements a sequence number cache of the last three (right now hardcoded) sent sequence numbers for a given XID, as suggested by the RFC. From RFC2203 5.3.3.1: "Note that the sequence number algorithm requires that the client increment the sequence number even if it is retrying a request with the same RPC transaction identifier. It is not infrequent for clients to get into a situation where they send two or more attempts and a slow server sends the reply for the first attempt. With RPCSEC_GSS, each request and reply will have a unique sequence number. If the client wishes to improve turn around time on the RPC call, it can cache the RPCSEC_GSS sequence number of each request it sends. Then when it receives a response with a matching RPC transaction identifier, it can compute the checksum of each sequence number in the cache to try to match the checksum in the reply's verifier." Signed-off-by: Nikhil Jha Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 81b952649d35..f46d1fb8f71a 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -30,6 +30,8 @@ #define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) #define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) +#define RPC_GSS_SEQNO_ARRAY_SIZE 3U + enum rpc_display_format_t { RPC_DISPLAY_ADDR = 0, RPC_DISPLAY_PORT, @@ -66,7 +68,8 @@ struct rpc_rqst { struct rpc_cred * rq_cred; /* Bound cred */ __be32 rq_xid; /* request XID */ int rq_cong; /* has incremented xprt->cong */ - u32 rq_seqno; /* gss seq no. used on req. */ + u32 rq_seqnos[RPC_GSS_SEQNO_ARRAY_SIZE]; /* past gss req seq nos. */ + unsigned int rq_seqno_count; /* number of entries in rq_seqnos */ int rq_enc_pages_num; struct page **rq_enc_pages; /* scratch pages for use by gss privacy code */ @@ -119,6 +122,18 @@ struct rpc_rqst { #define rq_svec rq_snd_buf.head #define rq_slen rq_snd_buf.len +static inline int xprt_rqst_add_seqno(struct rpc_rqst *req, u32 seqno) +{ + if (likely(req->rq_seqno_count < RPC_GSS_SEQNO_ARRAY_SIZE)) + req->rq_seqno_count++; + + /* Shift array to make room for the newest element at the beginning */ + memmove(&req->rq_seqnos[1], &req->rq_seqnos[0], + (RPC_GSS_SEQNO_ARRAY_SIZE - 1) * sizeof(req->rq_seqnos[0])); + req->rq_seqnos[0] = seqno; + return 0; +} + /* RPC transport layer security policies */ enum xprtsec_policies { RPC_XPRTSEC_NONE = 0, -- cgit v1.2.3 From e5296637a322b840d772f88f4d58b4bc72b29058 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 8 Apr 2025 09:43:15 -0400 Subject: nfs: add a refcount tracker for struct net as held by the nfs_client These are long-held references to the netns, so make sure the refcount tracker is aware of them. Signed-off-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index ee03f3cef30c..e02f4c4e9cc4 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -125,6 +125,7 @@ struct nfs_client { */ char cl_ipaddr[48]; struct net *cl_net; + netns_tracker cl_ns_tracker; struct list_head pending_cb_stateids; struct rcu_head rcu; -- cgit v1.2.3 From 1cb0f56d96185cb20e63e191fc291191823e6f52 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Mon, 19 May 2025 15:43:57 +0200 Subject: bpf: WARN_ONCE on verifier bugs Throughout the verifier's logic, there are multiple checks for inconsistent states that should never happen and would indicate a verifier bug. These bugs are typically logged in the verifier logs and sometimes preceded by a WARN_ONCE. This patch reworks these checks to consistently emit a verifier log AND a warning when CONFIG_DEBUG_KERNEL is enabled. The consistent use of WARN_ONCE should help fuzzers (ex. syzkaller) expose any situation where they are actually able to reach one of those buggy verifier states. Acked-by: Andrii Nakryiko Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/aCs1nYvNNMq8dAWP@mail.gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ include/linux/bpf_verifier.h | 11 +++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 83c56f40842b..5b25d278409b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -346,6 +346,12 @@ static inline const char *btf_field_type_name(enum btf_field_type type) } } +#if IS_ENABLED(CONFIG_DEBUG_KERNEL) +#define BPF_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) +#else +#define BPF_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) +#endif + static inline u32 btf_field_type_size(enum btf_field_type type) { switch (type) { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index cedd66867ecf..78c97e12ea4e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -839,6 +839,17 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, u32 insn_off, const char *prefix_fmt, ...); +#define verifier_bug_if(cond, env, fmt, args...) \ + ({ \ + bool __cond = (cond); \ + if (unlikely(__cond)) { \ + BPF_WARN_ONCE(1, "verifier bug: " fmt "(" #cond ")\n", ##args); \ + bpf_log(&env->log, "verifier bug: " fmt "(" #cond ")\n", ##args); \ + } \ + (__cond); \ + }) +#define verifier_bug(env, fmt, args...) verifier_bug_if(1, env, fmt, ##args) + static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { struct bpf_verifier_state *cur = env->cur_state; -- cgit v1.2.3 From 541a4219bd66bef56d93dbd306dc64a4d70ae99e Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Wed, 14 May 2025 17:19:33 -0700 Subject: cgroup: compare css to cgroup::self in helper for distingushing css Adjust the implementation of css_is_cgroup() so that it compares the given css to cgroup::self. Rename the function to css_is_self() in order to reflect that. Change the existing css->ss NULL check to a warning in the true branch. Finally, adjust call sites to use the new function name. Signed-off-by: JP Kobryn Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1f5b0a4a3356..989c08b09691 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -347,9 +347,15 @@ static inline bool css_is_dying(struct cgroup_subsys_state *css) return css->flags & CSS_DYING; } -static inline bool css_is_cgroup(struct cgroup_subsys_state *css) +static inline bool css_is_self(struct cgroup_subsys_state *css) { - return css->ss == NULL; + if (css == &css->cgroup->self) { + /* cgroup::self should not have subsystem association */ + WARN_ON(css->ss != NULL); + return true; + } + + return false; } static inline void cgroup_get(struct cgroup *cgrp) -- cgit v1.2.3 From 4d6d35d3417dcab14a9b1b9771c3cd62e8ed442b Mon Sep 17 00:00:00 2001 From: "Yo-Jung (Leo) Lin" Date: Wed, 30 Apr 2025 19:26:16 +0800 Subject: i2c: smbus: introduce Write Disable-aware SPD instantiating functions Some SMBus controllers may restrict writes to addresses where SPD sensors may reside. This may lead to some SPD sensors not functioning correctly, and might need extra handling. Introduce new SPD-instantiating functions that are aware of this, and use them instead. Signed-off-by: Yo-Jung Lin (Leo) Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20250430-for-upstream-i801-spd5118-no-instantiate-v2-1-2f54d91ae2c7@canonical.com Signed-off-by: Andi Shyti --- include/linux/i2c-smbus.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c-smbus.h b/include/linux/i2c-smbus.h index ced1c6ead52a..dc1bd2ab4c13 100644 --- a/include/linux/i2c-smbus.h +++ b/include/linux/i2c-smbus.h @@ -44,9 +44,11 @@ static inline void i2c_free_slave_host_notify_device(struct i2c_client *client) #endif #if IS_ENABLED(CONFIG_I2C_SMBUS) && IS_ENABLED(CONFIG_DMI) -void i2c_register_spd(struct i2c_adapter *adap); +void i2c_register_spd_write_disable(struct i2c_adapter *adap); +void i2c_register_spd_write_enable(struct i2c_adapter *adap); #else -static inline void i2c_register_spd(struct i2c_adapter *adap) { } +static inline void i2c_register_spd_write_disable(struct i2c_adapter *adap) { } +static inline void i2c_register_spd_write_enable(struct i2c_adapter *adap) { } #endif #endif /* _LINUX_I2C_SMBUS_H */ -- cgit v1.2.3 From 5da3bfa029d6809e192d112f39fca4dbe0137aaf Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Wed, 14 May 2025 17:19:34 -0700 Subject: cgroup: use separate rstat trees for each subsystem Different subsystems may call cgroup_rstat_updated() within the same cgroup, resulting in a tree of pending updates from multiple subsystems. When one of these subsystems is flushed via cgroup_rstat_flushed(), all other subsystems with pending updates on the tree will also be flushed. Change the paradigm of having a single rstat tree for all subsystems to having separate trees for each subsystem. This separation allows for subsystems to perform flushes without the side effects of other subsystems. As an example, flushing the cpu stats will no longer cause the memory stats to be flushed and vice versa. In order to achieve subsystem-specific trees, change the tree node type from cgroup to cgroup_subsys_state pointer. Then remove those pointers from the cgroup and instead place them on the css. Finally, change update/flush functions to make use of the different node type (css). These changes allow a specific subsystem to be associated with an update or flush. Separate rstat trees will now exist for each unique subsystem. Since updating/flushing will now be done at the subsystem level, there is no longer a need to keep track of updated css nodes at the cgroup level. The list management of these nodes done within the cgroup (rstat_css_list and related) has been removed accordingly. Conditional guards for checking validity of a given css were placed within css_rstat_updated/flush() to prevent undefined behavior occuring from kfunc usage in bpf programs. Guards were also placed within css_rstat_init/exit() in order to help consolidate calls to them. At call sites for all four functions, the existing guards were removed. Signed-off-by: JP Kobryn Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 46 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index e58bfb880111..17ecaae9c5f8 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -169,6 +169,8 @@ struct cgroup_subsys_state { /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; + struct css_rstat_cpu __percpu *rstat_cpu; + /* * siblings list anchored at the parent's ->children * @@ -177,9 +179,6 @@ struct cgroup_subsys_state { struct list_head sibling; struct list_head children; - /* flush target list anchored at cgrp->rstat_css_list */ - struct list_head rstat_css_node; - /* * PI: Subsys-unique ID. 0 is unused and root is always 1. The * matching css can be looked up using css_from_id(). @@ -219,6 +218,13 @@ struct cgroup_subsys_state { * Protected by cgroup_mutex. */ int nr_descendants; + + /* + * A singly-linked list of css structures to be rstat flushed. + * This is a scratch field to be used exclusively by + * css_rstat_flush() and protected by cgroup_rstat_lock. + */ + struct cgroup_subsys_state *rstat_flush_next; }; /* @@ -329,10 +335,10 @@ struct cgroup_base_stat { /* * rstat - cgroup scalable recursive statistics. Accounting is done - * per-cpu in cgroup_rstat_cpu which is then lazily propagated up the + * per-cpu in css_rstat_cpu which is then lazily propagated up the * hierarchy on reads. * - * When a stat gets updated, the cgroup_rstat_cpu and its ancestors are + * When a stat gets updated, the css_rstat_cpu and its ancestors are * linked into the updated tree. On the following read, propagation only * considers and consumes the updated tree. This makes reading O(the * number of descendants which have been active since last read) instead of @@ -346,20 +352,20 @@ struct cgroup_base_stat { * This struct hosts both the fields which implement the above - * updated_children and updated_next. */ -struct cgroup_rstat_cpu { +struct css_rstat_cpu { /* * Child cgroups with stat updates on this cpu since the last read * are linked on the parent's ->updated_children through - * ->updated_next. + * ->updated_next. updated_children is terminated by its container css. * - * In addition to being more compact, singly-linked list pointing - * to the cgroup makes it unnecessary for each per-cpu struct to - * point back to the associated cgroup. + * In addition to being more compact, singly-linked list pointing to + * the css makes it unnecessary for each per-cpu struct to point back + * to the associated css. * * Protected by per-cpu cgroup_rstat_cpu_lock. */ - struct cgroup *updated_children; /* terminated by self cgroup */ - struct cgroup *updated_next; /* NULL iff not on the list */ + struct cgroup_subsys_state *updated_children; + struct cgroup_subsys_state *updated_next; /* NULL if not on the list */ }; /* @@ -521,25 +527,15 @@ struct cgroup { struct cgroup *dom_cgrp; struct cgroup *old_dom_cgrp; /* used while enabling threaded */ - /* per-cpu recursive resource statistics */ - struct cgroup_rstat_cpu __percpu *rstat_cpu; struct cgroup_rstat_base_cpu __percpu *rstat_base_cpu; - struct list_head rstat_css_list; /* - * Add padding to separate the read mostly rstat_cpu and - * rstat_css_list into a different cacheline from the following - * rstat_flush_next and *bstat fields which can have frequent updates. + * Add padding to keep the read mostly rstat per-cpu pointer on a + * different cacheline than the following *bstat fields which can have + * frequent updates. */ CACHELINE_PADDING(_pad_); - /* - * A singly-linked list of cgroup structures to be rstat flushed. - * This is a scratch field to be used exclusively by - * css_rstat_flush_locked() and protected by cgroup_rstat_lock. - */ - struct cgroup *rstat_flush_next; - /* cgroup basic resource statistics */ struct cgroup_base_stat last_bstat; struct cgroup_base_stat bstat; -- cgit v1.2.3 From 748922dcfabdd655d25fb6dd09a60e694a3d35e6 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Wed, 14 May 2025 17:19:35 -0700 Subject: cgroup: use subsystem-specific rstat locks to avoid contention It is possible to eliminate contention between subsystems when updating/flushing stats by using subsystem-specific locks. Let the existing rstat locks be dedicated to the cgroup base stats and rename them to reflect that. Add similar locks to the cgroup_subsys struct for use with individual subsystems. Lock initialization is done in the new function ss_rstat_init(ss) which replaces cgroup_rstat_boot(void). If NULL is passed to this function, the global base stat locks will be initialized. Otherwise, the subsystem locks will be initialized. Change the existing lock helper functions to accept a reference to a css. Then within these functions, conditionally select the appropriate locks based on the subsystem affiliation of the given css. Add helper functions for this selection routine to avoid repeated code. Signed-off-by: JP Kobryn Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 17ecaae9c5f8..5b8127d29dc5 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -222,7 +222,10 @@ struct cgroup_subsys_state { /* * A singly-linked list of css structures to be rstat flushed. * This is a scratch field to be used exclusively by - * css_rstat_flush() and protected by cgroup_rstat_lock. + * css_rstat_flush(). + * + * Protected by rstat_base_lock when css is cgroup::self. + * Protected by css->ss->rstat_ss_lock otherwise. */ struct cgroup_subsys_state *rstat_flush_next; }; @@ -362,7 +365,7 @@ struct css_rstat_cpu { * the css makes it unnecessary for each per-cpu struct to point back * to the associated css. * - * Protected by per-cpu cgroup_rstat_cpu_lock. + * Protected by per-cpu css->ss->rstat_ss_cpu_lock. */ struct cgroup_subsys_state *updated_children; struct cgroup_subsys_state *updated_next; /* NULL if not on the list */ @@ -792,6 +795,9 @@ struct cgroup_subsys { * specifies the mask of subsystems that this one depends on. */ unsigned int depends_on; + + spinlock_t rstat_ss_lock; + raw_spinlock_t __percpu *rstat_ss_cpu_lock; }; extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; -- cgit v1.2.3 From f3921fb7fdc23dd946b0c082f5fedca9ce75d506 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Wed, 14 May 2025 17:19:37 -0700 Subject: cgroup: document the rstat per-cpu initialization The calls to css_rstat_init() occur at different places depending on the context. Document the conditions that determine which point of initialization is used. Signed-off-by: JP Kobryn Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5b8127d29dc5..e61687d5e496 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -169,6 +169,21 @@ struct cgroup_subsys_state { /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; + /* + * Depending on the context, this field is initialized + * via css_rstat_init() at different places: + * + * when css is associated with cgroup::self + * when css->cgroup is the root cgroup + * performed in cgroup_init() + * when css->cgroup is not the root cgroup + * performed in cgroup_create() + * when css is associated with a subsystem + * when css->cgroup is the root cgroup + * performed in cgroup_init_subsys() in the non-early path + * when css->cgroup is not the root cgroup + * performed in css_create() + */ struct css_rstat_cpu __percpu *rstat_cpu; /* @@ -530,6 +545,15 @@ struct cgroup { struct cgroup *dom_cgrp; struct cgroup *old_dom_cgrp; /* used while enabling threaded */ + /* + * Depending on the context, this field is initialized via + * css_rstat_init() at different places: + * + * when cgroup is the root cgroup + * performed in cgroup_setup_root() + * otherwise + * performed in cgroup_create() + */ struct cgroup_rstat_base_cpu __percpu *rstat_base_cpu; /* -- cgit v1.2.3 From 22f2dc03553fb3f407fbd5e6b5b4f9c747927cb2 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 19 May 2025 11:08:51 +0300 Subject: PCI: Add Intel Wildcat Lake audio Device ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Wildcat Lake (WCL) audio Device ID. Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Bard Liao Reviewed-by: Guennadi Liakhovetski Reviewed-by: Ranjani Sridharan Reviewed-by: Pierre-Louis Bossart Acked-by: Krzysztof Wilczyński Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250519080855.16977-2-peter.ujfalusi@linux.intel.com --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 2e28182c3af0..f7ceefce6d3d 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -3049,6 +3049,7 @@ #define PCI_DEVICE_ID_INTEL_HDA_DG1 0x490d #define PCI_DEVICE_ID_INTEL_HDA_EHL_0 0x4b55 #define PCI_DEVICE_ID_INTEL_HDA_EHL_3 0x4b58 +#define PCI_DEVICE_ID_INTEL_HDA_WCL 0x4d28 #define PCI_DEVICE_ID_INTEL_HDA_JSL_N 0x4dc8 #define PCI_DEVICE_ID_INTEL_HDA_DG2_0 0x4f90 #define PCI_DEVICE_ID_INTEL_HDA_DG2_1 0x4f91 -- cgit v1.2.3 From 58f5fbeb367ff6f30a2448b2cad70f70b2de4b06 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 16 May 2025 21:28:03 +0200 Subject: fanotify: support watching filesystems and mounts inside userns An unprivileged user is allowed to create an fanotify group and add inode marks, but not filesystem, mntns and mount marks. Add limited support for setting up filesystem, mntns and mount marks by an unprivileged user under the following conditions: 1. User has CAP_SYS_ADMIN in the user ns where the group was created 2.a. User has CAP_SYS_ADMIN in the user ns where the sb was created OR (in case setting up a mntns mark) 2.b. User has CAP_SYS_ADMIN in the user ns associated with the mntns Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/20250516192803.838659-3-amir73il@gmail.com --- include/linux/fanotify.h | 5 ++--- include/linux/fsnotify_backend.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 3c817dc6292e..879cff5eccd4 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -38,8 +38,7 @@ FAN_REPORT_PIDFD | \ FAN_REPORT_FD_ERROR | \ FAN_UNLIMITED_QUEUE | \ - FAN_UNLIMITED_MARKS | \ - FAN_REPORT_MNT) + FAN_UNLIMITED_MARKS) /* * fanotify_init() flags that are allowed for user without CAP_SYS_ADMIN. @@ -48,7 +47,7 @@ * so one of the flags for reporting file handles is required. */ #define FANOTIFY_USER_INIT_FLAGS (FAN_CLASS_NOTIF | \ - FANOTIFY_FID_BITS | \ + FANOTIFY_FID_BITS | FAN_REPORT_MNT | \ FAN_CLOEXEC | FAN_NONBLOCK) #define FANOTIFY_INIT_FLAGS (FANOTIFY_ADMIN_INIT_FLAGS | \ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 6cd8d1d28b8b..7dd22db06317 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -250,6 +250,7 @@ struct fsnotify_group { * full */ struct mem_cgroup *memcg; /* memcg to charge allocations */ + struct user_namespace *user_ns; /* user ns where group was created */ /* groups can define private fields here or use the void *private */ union { -- cgit v1.2.3 From a462903fa22541f212134fba81084315ad843e6e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 16 May 2025 13:59:27 +0200 Subject: net: netlink: reduce extack cookie size Seems like the extack cookie hasn't found any users outside of wireless, which always uses nl_set_extack_cookie_u64(). Thus, allocating 20 bytes for it is pointless, reduce that to 8 bytes, and add a BUILD_BUG_ON() to ensure it's enough (obviously it is, for a u64, but in case it changes again.) Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20250516115927.38209-2-johannes@sipsolutions.net Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index c3ae84a77e16..882e9c1b6c1d 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -63,7 +63,7 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) } /* this can be increased when necessary - don't expose to userland */ -#define NETLINK_MAX_COOKIE_LEN 20 +#define NETLINK_MAX_COOKIE_LEN 8 #define NETLINK_MAX_FMTMSG_LEN 80 /** @@ -212,6 +212,7 @@ static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, { if (!extack) return; + BUILD_BUG_ON(sizeof(extack->cookie) < sizeof(cookie)); memcpy(extack->cookie, &cookie, sizeof(cookie)); extack->cookie_len = sizeof(cookie); } -- cgit v1.2.3 From 1c9a93bf1d01729c54e9acbaa538db81f16563b2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 25 Apr 2025 20:06:34 -0600 Subject: dmapool: add NUMA affinity support Introduce dma_pool_create_node(), like dma_pool_create() but taking an additional NUMA node argument. Allocate struct dma_pool on the desired node, and store the node on dma_pool for allocating struct dma_page. Make dma_pool_create() an alias for dma_pool_create_node() with node set to NUMA_NO_NODE. Signed-off-by: Keith Busch Signed-off-by: Caleb Sander Mateos Reviewed-by: Jens Axboe Reviewed-by: John Garry Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi Signed-off-by: Christoph Hellwig --- include/linux/dmapool.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h index f632ecfb4238..06c4de602b2f 100644 --- a/include/linux/dmapool.h +++ b/include/linux/dmapool.h @@ -11,6 +11,7 @@ #ifndef LINUX_DMAPOOL_H #define LINUX_DMAPOOL_H +#include #include #include @@ -18,8 +19,8 @@ struct device; #ifdef CONFIG_HAS_DMA -struct dma_pool *dma_pool_create(const char *name, struct device *dev, - size_t size, size_t align, size_t allocation); +struct dma_pool *dma_pool_create_node(const char *name, struct device *dev, + size_t size, size_t align, size_t boundary, int node); void dma_pool_destroy(struct dma_pool *pool); @@ -35,9 +36,12 @@ struct dma_pool *dmam_pool_create(const char *name, struct device *dev, void dmam_pool_destroy(struct dma_pool *pool); #else /* !CONFIG_HAS_DMA */ -static inline struct dma_pool *dma_pool_create(const char *name, - struct device *dev, size_t size, size_t align, size_t allocation) -{ return NULL; } +static inline struct dma_pool *dma_pool_create_node(const char *name, + struct device *dev, size_t size, size_t align, size_t boundary, + int node) +{ + return NULL; +} static inline void dma_pool_destroy(struct dma_pool *pool) { } static inline void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle) { return NULL; } @@ -49,6 +53,13 @@ static inline struct dma_pool *dmam_pool_create(const char *name, static inline void dmam_pool_destroy(struct dma_pool *pool) { } #endif /* !CONFIG_HAS_DMA */ +static inline struct dma_pool *dma_pool_create(const char *name, + struct device *dev, size_t size, size_t align, size_t boundary) +{ + return dma_pool_create_node(name, dev, size, align, boundary, + NUMA_NO_NODE); +} + static inline void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle) { -- cgit v1.2.3 From 2695b3c7fe4fa06a377ea0d66e3fe5fdb60310f0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 11 May 2025 11:28:36 -0700 Subject: MIPS: bcm63xx: nvram: avoid inefficient use of crc32_le_combine() bcm963xx_nvram_checksum() was using crc32_le_combine() to update a CRC with four zero bytes. However, this is about 5x slower than just CRC'ing four zero bytes in the normal way. Just do that instead. (We could instead make crc32_le_combine() faster on short lengths. But all its callers do just fine without it, so I'd like to just remove it.) Signed-off-by: Eric Biggers Signed-off-by: Thomas Bogendoerfer --- include/linux/bcm963xx_nvram.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bcm963xx_nvram.h b/include/linux/bcm963xx_nvram.h index c8c7f01159fe..48830bf18042 100644 --- a/include/linux/bcm963xx_nvram.h +++ b/include/linux/bcm963xx_nvram.h @@ -81,25 +81,21 @@ static int __maybe_unused bcm963xx_nvram_checksum( const struct bcm963xx_nvram *nvram, u32 *expected_out, u32 *actual_out) { + const u32 zero = 0; u32 expected, actual; size_t len; if (nvram->version <= 4) { expected = nvram->checksum_v4; - len = BCM963XX_NVRAM_V4_SIZE - sizeof(u32); + len = BCM963XX_NVRAM_V4_SIZE; } else { expected = nvram->checksum_v5; - len = BCM963XX_NVRAM_V5_SIZE - sizeof(u32); + len = BCM963XX_NVRAM_V5_SIZE; } - /* - * Calculate the CRC32 value for the nvram with a checksum value - * of 0 without modifying or copying the nvram by combining: - * - The CRC32 of the nvram without the checksum value - * - The CRC32 of a zero checksum value (which is also 0) - */ - actual = crc32_le_combine( - crc32_le(~0, (u8 *)nvram, len), 0, sizeof(u32)); + /* Calculate the CRC32 of the nvram with the checksum field set to 0. */ + actual = crc32_le(~0, nvram, len - sizeof(u32)); + actual = crc32_le(actual, &zero, sizeof(u32)); if (expected_out) *expected_out = expected; -- cgit v1.2.3 From 31be641d74267d98317ef5a2b90e6200511cabb3 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 15 May 2025 10:11:54 +0200 Subject: net: phy: make mdio consumer / device layer a separate module After having factored out the provider part from mdio_bus.c, we can make the mdio consumer / device layer a separate module. This also allows to remove Kconfig symbol MDIO_DEVICE. The module init / exit functions from mdio_bus.c no longer have to be called from phy_device.c. The link order defined in drivers/net/phy/Makefile ensures that init / exit functions are called in the right order. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/dba6b156-5748-44ce-b5e2-e8dc2fcee5a7@gmail.com Signed-off-by: Paolo Abeni --- include/linux/phy.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 7c29d346d4b3..92a88b5ce356 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2033,9 +2033,6 @@ int phy_ethtool_set_link_ksettings(struct net_device *ndev, const struct ethtool_link_ksettings *cmd); int phy_ethtool_nway_reset(struct net_device *ndev); -int __init mdio_bus_init(void); -void mdio_bus_exit(void); - int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data); int phy_ethtool_get_sset_count(struct phy_device *phydev); int phy_ethtool_get_stats(struct phy_device *phydev, -- cgit v1.2.3 From 3318f7b5cefbff96b1bb49584ac38d2c9997a830 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 19 May 2025 10:51:28 -0700 Subject: iommu/io-pgtable-arm: Add quirk to quiet WARN_ON() In situations where mapping/unmapping sequence can be controlled by userspace, attempting to map over a region that has not yet been unmapped is an error. But not something that should spam dmesg. Now that there is a quirk, we can also drop the selftest_running flag, and use the quirk instead for selftests. Acked-by: Robin Murphy Signed-off-by: Rob Clark Link: https://lore.kernel.org/r/20250519175348.11924-6-robdclark@gmail.com [will: Rename quirk to IO_PGTABLE_QUIRK_NO_WARN per Robin's suggestion] Signed-off-by: Will Deacon --- include/linux/io-pgtable.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index bba2a51c87d2..138fbd89b1e6 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -88,6 +88,13 @@ struct io_pgtable_cfg { * * IO_PGTABLE_QUIRK_ARM_HD: Enables dirty tracking in stage 1 pagetable. * IO_PGTABLE_QUIRK_ARM_S2FWB: Use the FWB format for the MemAttrs bits + * + * IO_PGTABLE_QUIRK_NO_WARN: Do not WARN_ON() on conflicting + * mappings, but silently return -EEXISTS. Normally an attempt + * to map over an existing mapping would indicate some sort of + * kernel bug, which would justify the WARN_ON(). But for GPU + * drivers, this could be under control of userspace. Which + * deserves an error return, but not to spam dmesg. */ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -97,6 +104,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) #define IO_PGTABLE_QUIRK_ARM_HD BIT(7) #define IO_PGTABLE_QUIRK_ARM_S2FWB BIT(8) + #define IO_PGTABLE_QUIRK_NO_WARN BIT(9) unsigned long quirks; unsigned long pgsize_bitmap; unsigned int ias; -- cgit v1.2.3 From d6bf294773a472fe4694b92714af482f5c6aa4c6 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 12 May 2025 14:33:15 +0800 Subject: ext4/jbd2: convert jbd2_journal_blocks_per_page() to support large folio jbd2_journal_blocks_per_page() returns the number of blocks in a single page. Rename it to jbd2_journal_blocks_per_folio() and make it returns the number of blocks in the largest folio, preparing for the calculation of journal credits blocks when allocating blocks within a large folio in the writeback path. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250512063319.3539411-5-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 023e8abdb99a..ebbcdab474d5 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1723,7 +1723,7 @@ static inline int tid_geq(tid_t x, tid_t y) return (difference >= 0); } -extern int jbd2_journal_blocks_per_page(struct inode *inode); +extern int jbd2_journal_blocks_per_folio(struct inode *inode); extern size_t journal_tag_bytes(journal_t *journal); static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) -- cgit v1.2.3 From 76005718cf8bfdb6b0f818ea75ca6a4b3bee34cd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 12 May 2025 22:38:08 -0700 Subject: jbd2: remove journal_t argument from jbd2_chksum() Since jbd2_chksum() no longer uses its journal_t argument, remove it. Signed-off-by: Eric Biggers Reviewed-by: Baokun Li Link: https://patch.msgid.link/20250513053809.699974-4-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index ebbcdab474d5..43b9297fe8a7 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1766,8 +1766,7 @@ static inline unsigned long jbd2_log_space_left(journal_t *journal) #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ #define BJ_Types 5 -static inline u32 jbd2_chksum(journal_t *journal, u32 crc, - const void *address, unsigned int length) +static inline u32 jbd2_chksum(u32 crc, const void *address, unsigned int length) { return crc32c(crc, address, length); } -- cgit v1.2.3 From 7e6f4a0a7512eca93cecff3bcb37f0ed164949a2 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 19 May 2025 13:13:14 +0200 Subject: i2c: remove 'of_node' member from i2c_boardinfo There is no user of this member anymore. We can remove it. Reviewed-by: Andi Shyti Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index cc1437f29823..20fd41b51d5c 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -405,7 +405,6 @@ static inline bool i2c_detect_slave_mode(struct device *dev) { return false; } * @addr: stored in i2c_client.addr * @dev_name: Overrides the default - dev_name if set * @platform_data: stored in i2c_client.dev.platform_data - * @of_node: **DEPRECATED** - use @fwnode for this * @fwnode: device node supplied by the platform firmware * @swnode: software node for the device * @resources: resources associated with the device @@ -429,7 +428,6 @@ struct i2c_board_info { unsigned short addr; const char *dev_name; void *platform_data; - struct device_node *of_node; struct fwnode_handle *fwnode; const struct software_node *swnode; const struct resource *resources; -- cgit v1.2.3 From 3f1716ee0f6c63795e6d225e3f5ec3825cd2bd57 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 17 May 2025 22:34:32 +0200 Subject: net: phy: fixed_phy: remove irq argument from fixed_phy_add All callers pass PHY_POLL, therefore remove irq argument from fixed_phy_add(). Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Reviewed-by: Florian Fainelli Acked-by: Greg Ungerer Link: https://patch.msgid.link/b3b9b3bc-c310-4a54-b376-c909c83575de@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy_fixed.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 3392c09b5d24..316bb4deda37 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -17,8 +17,7 @@ struct net_device; #if IS_ENABLED(CONFIG_FIXED_PHY) extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier); -extern int fixed_phy_add(unsigned int irq, int phy_id, - struct fixed_phy_status *status); +int fixed_phy_add(int phy_id, struct fixed_phy_status *status); extern struct phy_device *fixed_phy_register(unsigned int irq, struct fixed_phy_status *status, struct device_node *np); @@ -28,7 +27,7 @@ extern int fixed_phy_set_link_update(struct phy_device *phydev, int (*link_update)(struct net_device *, struct fixed_phy_status *)); #else -static inline int fixed_phy_add(unsigned int irq, int phy_id, +static inline int fixed_phy_add(int phy_id, struct fixed_phy_status *status) { return -ENODEV; -- cgit v1.2.3 From d23b4af5df3900fb0b4e1a05cb8119dd1c395519 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 17 May 2025 22:35:56 +0200 Subject: net: phy: fixed_phy: remove irq argument from fixed_phy_register All callers pass PHY_POLL, therefore remove irq argument from fixed_phy_register(). Note: I keep the irq argument in fixed_phy_add_gpiod() for now, for the case that somebody may want to use a GPIO interrupt in the future, by e.g. adding a call to fwnode_irq_get() to fixed_phy_get_gpiod(). Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/31cdb232-a5e9-4997-a285-cb9a7d208124@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy_fixed.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 316bb4deda37..634149a73c2a 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -18,9 +18,8 @@ struct net_device; #if IS_ENABLED(CONFIG_FIXED_PHY) extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier); int fixed_phy_add(int phy_id, struct fixed_phy_status *status); -extern struct phy_device *fixed_phy_register(unsigned int irq, - struct fixed_phy_status *status, - struct device_node *np); +struct phy_device *fixed_phy_register(struct fixed_phy_status *status, + struct device_node *np); extern void fixed_phy_unregister(struct phy_device *phydev); extern int fixed_phy_set_link_update(struct phy_device *phydev, @@ -32,9 +31,9 @@ static inline int fixed_phy_add(int phy_id, { return -ENODEV; } -static inline struct phy_device *fixed_phy_register(unsigned int irq, - struct fixed_phy_status *status, - struct device_node *np) +static inline struct phy_device * +fixed_phy_register(struct fixed_phy_status *status, + struct device_node *np) { return ERR_PTR(-ENODEV); } -- cgit v1.2.3 From 4ba1c5bb4811f560a86697311cb4e9741e047a5d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 17 May 2025 22:37:29 +0200 Subject: net: phy: fixed_phy: constify status argument where possible Constify the passed struct fixed_phy_status *status where possible. Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/d1764b62-8538-408b-a4e3-b63715481a38@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy_fixed.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 634149a73c2a..5399b9e41e35 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -17,8 +17,8 @@ struct net_device; #if IS_ENABLED(CONFIG_FIXED_PHY) extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier); -int fixed_phy_add(int phy_id, struct fixed_phy_status *status); -struct phy_device *fixed_phy_register(struct fixed_phy_status *status, +int fixed_phy_add(int phy_id, const struct fixed_phy_status *status); +struct phy_device *fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np); extern void fixed_phy_unregister(struct phy_device *phydev); @@ -27,12 +27,12 @@ extern int fixed_phy_set_link_update(struct phy_device *phydev, struct fixed_phy_status *)); #else static inline int fixed_phy_add(int phy_id, - struct fixed_phy_status *status) + const struct fixed_phy_status *status) { return -ENODEV; } static inline struct phy_device * -fixed_phy_register(struct fixed_phy_status *status, +fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np) { return ERR_PTR(-ENODEV); -- cgit v1.2.3 From 7190b3c8bd2b0cde483bd440cf91ba1c518b4261 Mon Sep 17 00:00:00 2001 From: Ignacio Moreno Gonzalez Date: Wed, 7 May 2025 15:28:06 +0200 Subject: mm: mmap: map MAP_STACK to VM_NOHUGEPAGE only if THP is enabled commit c4608d1bf7c6 ("mm: mmap: map MAP_STACK to VM_NOHUGEPAGE") maps the mmap option MAP_STACK to VM_NOHUGEPAGE. This is also done if CONFIG_TRANSPARENT_HUGEPAGE is not defined. But in that case, the VM_NOHUGEPAGE does not make sense. I discovered this issue when trying to use the tool CRIU to checkpoint and restore a container. Our running kernel is compiled without CONFIG_TRANSPARENT_HUGEPAGE. CRIU parses the output of /proc//smaps and saves the "nh" flag. When trying to restore the container, CRIU fails to restore the "nh" mappings, since madvise() MADV_NOHUGEPAGE always returns an error because CONFIG_TRANSPARENT_HUGEPAGE is not defined. Link: https://lkml.kernel.org/r/20250507-map-map_stack-to-vm_nohugepage-only-if-thp-is-enabled-v5-1-c6c38cfefd6e@kuka.com Fixes: c4608d1bf7c6 ("mm: mmap: map MAP_STACK to VM_NOHUGEPAGE") Signed-off-by: Ignacio Moreno Gonzalez Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Yang Shi Reviewed-by: Liam R. Howlett Cc: Signed-off-by: Andrew Morton --- include/linux/mman.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mman.h b/include/linux/mman.h index bce214fece16..f4c6346a8fcd 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -155,7 +155,9 @@ calc_vm_flag_bits(struct file *file, unsigned long flags) return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | +#ifdef CONFIG_TRANSPARENT_HUGEPAGE _calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) | +#endif arch_calc_vm_flag_bits(file, flags); } -- cgit v1.2.3 From 0f518255bde881d2a2605bbc080b438b532b6ab2 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Wed, 7 May 2025 15:09:57 +0200 Subject: mm: fix VM_UFFD_MINOR == VM_SHADOW_STACK on USERFAULTFD=y && ARM64_GCS=y On configs with CONFIG_ARM64_GCS=y, VM_SHADOW_STACK is bit 38. On configs with CONFIG_HAVE_ARCH_USERFAULTFD_MINOR=y (selected by CONFIG_ARM64 when CONFIG_USERFAULTFD=y), VM_UFFD_MINOR is _also_ bit 38. This bit being shared by two different VMA flags could lead to all sorts of unintended behaviors. Presumably, a process could maybe call into userfaultfd in a way that disables the shadow stack vma flag. I can't think of any attack where this would help (presumably, if an attacker tries to disable shadow stacks, they are trying to hijack control flow so can't arbitrarily call into userfaultfd yet anyway) but this still feels somewhat scary. Link: https://lkml.kernel.org/r/20250507131000.1204175-2-revest@chromium.org Fixes: ae80e1629aea ("mm: Define VM_SHADOW_STACK for arm64 when we support GCS") Signed-off-by: Florent Revest Reviewed-by: Mark Brown Cc: Borislav Betkov Cc: Brendan Jackman Cc: Catalin Marinas Cc: Florent Revest Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thiago Jung Bauermann Cc: Thomas Gleinxer Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index bf55206935c4..fdda6b16263b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -385,7 +385,7 @@ extern unsigned int kobjsize(const void *objp); #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR -# define VM_UFFD_MINOR_BIT 38 +# define VM_UFFD_MINOR_BIT 41 # define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ #else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ # define VM_UFFD_MINOR VM_NONE -- cgit v1.2.3 From 97dfbbd135cb5e4426f37ca53a8fa87eaaa4e376 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 14 May 2025 18:06:02 +0100 Subject: highmem: add folio_test_partial_kmap() In commit c749d9b7ebbc ("iov_iter: fix copy_page_from_iter_atomic() if KMAP_LOCAL_FORCE_MAP"), Hugh correctly noted that if KMAP_LOCAL_FORCE_MAP is enabled, we must limit ourselves to PAGE_SIZE bytes per call to kmap_local(). The same problem exists in memcpy_from_folio(), memcpy_to_folio(), folio_zero_tail(), folio_fill_tail() and memcpy_from_file_folio(), so add folio_test_partial_kmap() to do this more succinctly. Link: https://lkml.kernel.org/r/20250514170607.3000994-2-willy@infradead.org Fixes: 00cdf76012ab ("mm: add memcpy_from_file_folio()") Signed-off-by: Matthew Wilcox (Oracle) Cc: Al Viro Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton --- include/linux/highmem.h | 10 +++++----- include/linux/page-flags.h | 7 +++++++ 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 5c6bea81a90e..c698f8415675 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -461,7 +461,7 @@ static inline void memcpy_from_folio(char *to, struct folio *folio, const char *from = kmap_local_folio(folio, offset); size_t chunk = len; - if (folio_test_highmem(folio) && + if (folio_test_partial_kmap(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); @@ -489,7 +489,7 @@ static inline void memcpy_to_folio(struct folio *folio, size_t offset, char *to = kmap_local_folio(folio, offset); size_t chunk = len; - if (folio_test_highmem(folio) && + if (folio_test_partial_kmap(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); @@ -522,7 +522,7 @@ static inline __must_check void *folio_zero_tail(struct folio *folio, { size_t len = folio_size(folio) - offset; - if (folio_test_highmem(folio)) { + if (folio_test_partial_kmap(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { @@ -560,7 +560,7 @@ static inline void folio_fill_tail(struct folio *folio, size_t offset, VM_BUG_ON(offset + len > folio_size(folio)); - if (folio_test_highmem(folio)) { + if (folio_test_partial_kmap(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { @@ -597,7 +597,7 @@ static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, size_t offset = offset_in_folio(folio, pos); char *from = kmap_local_folio(folio, offset); - if (folio_test_highmem(folio)) { + if (folio_test_partial_kmap(folio)) { offset = offset_in_page(offset); len = min_t(size_t, len, PAGE_SIZE - offset); } else diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e6a21b62dcce..3b814ce08331 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -615,6 +615,13 @@ FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE) PAGEFLAG_FALSE(HighMem, highmem) #endif +/* Does kmap_local_folio() only allow access to one page of the folio? */ +#ifdef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP +#define folio_test_partial_kmap(f) true +#else +#define folio_test_partial_kmap(f) folio_test_highmem(f) +#endif + #ifdef CONFIG_SWAP static __always_inline bool folio_test_swapcache(const struct folio *folio) { -- cgit v1.2.3 From 223657cfc87c3d5b9c2172014181c8ed7acf58e8 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Tue, 20 May 2025 15:28:26 +0200 Subject: pinctrl: remove extern specifier for functions in machine.h Extern is the default specifier for a function, no need to define it. Suggested-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/20250520-aaeon-up-board-pinctrl-support-v6-2-dcb3756be3c6@bootlin.com Signed-off-by: Linus Walleij --- include/linux/pinctrl/machine.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/machine.h b/include/linux/pinctrl/machine.h index 673e96df453b..0940fabb154d 100644 --- a/include/linux/pinctrl/machine.h +++ b/include/linux/pinctrl/machine.h @@ -153,10 +153,10 @@ struct pinctrl_map; #ifdef CONFIG_PINCTRL -extern int pinctrl_register_mappings(const struct pinctrl_map *map, - unsigned int num_maps); -extern void pinctrl_unregister_mappings(const struct pinctrl_map *map); -extern void pinctrl_provide_dummies(void); +int pinctrl_register_mappings(const struct pinctrl_map *map, + unsigned int num_maps); +void pinctrl_unregister_mappings(const struct pinctrl_map *map); +void pinctrl_provide_dummies(void); #else static inline int pinctrl_register_mappings(const struct pinctrl_map *map, -- cgit v1.2.3 From 2e9ba1d9a31f68b4deb5f9e62a9201a51b00abf7 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Tue, 20 May 2025 15:28:27 +0200 Subject: pinctrl: core: add devm_pinctrl_register_mappings() Using devm_pinctrl_register_mappings(), the core can automatically unregister pinctrl mappings. Reviewed-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/20250520-aaeon-up-board-pinctrl-support-v6-3-dcb3756be3c6@bootlin.com Signed-off-by: Linus Walleij --- include/linux/pinctrl/machine.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pinctrl/machine.h b/include/linux/pinctrl/machine.h index 0940fabb154d..25620229b1d6 100644 --- a/include/linux/pinctrl/machine.h +++ b/include/linux/pinctrl/machine.h @@ -149,12 +149,16 @@ struct pinctrl_map { #define PIN_MAP_CONFIGS_GROUP_HOG_DEFAULT(dev, grp, cfgs) \ PIN_MAP_CONFIGS_GROUP(dev, PINCTRL_STATE_DEFAULT, dev, grp, cfgs) +struct device; struct pinctrl_map; #ifdef CONFIG_PINCTRL int pinctrl_register_mappings(const struct pinctrl_map *map, unsigned int num_maps); +int devm_pinctrl_register_mappings(struct device *dev, + const struct pinctrl_map *map, + unsigned int num_maps); void pinctrl_unregister_mappings(const struct pinctrl_map *map); void pinctrl_provide_dummies(void); #else @@ -165,6 +169,13 @@ static inline int pinctrl_register_mappings(const struct pinctrl_map *map, return 0; } +static inline int devm_pinctrl_register_mappings(struct device *dev, + const struct pinctrl_map *map, + unsigned int num_maps) +{ + return 0; +} + static inline void pinctrl_unregister_mappings(const struct pinctrl_map *map) { } -- cgit v1.2.3 From 70892277ca2dbad30ce89acf62fb62045d4bc59b Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Tue, 20 May 2025 15:08:56 -0400 Subject: iommu/arm-smmu-qcom: Make set_stall work when the device is on Up until now we have only called the set_stall callback during initialization when the device is off. But we will soon start calling it to temporarily disable stall-on-fault when the device is on, so handle that by checking if the device is on and writing SCTLR. Signed-off-by: Connor Abbott Reviewed-by: Rob Clark Link: https://lore.kernel.org/r/20250520-msm-gpu-fault-fixes-next-v8-3-fce6ee218787@gmail.com [will: Fix "mixed declarations and code" warning from sparse] Signed-off-by: Will Deacon --- include/linux/adreno-smmu-priv.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/adreno-smmu-priv.h b/include/linux/adreno-smmu-priv.h index abec23c7744f..d83c9175828f 100644 --- a/include/linux/adreno-smmu-priv.h +++ b/include/linux/adreno-smmu-priv.h @@ -45,9 +45,9 @@ struct adreno_smmu_fault_info { * TTBR0 translation is enabled with the specified cfg * @get_fault_info: Called by the GPU fault handler to get information about * the fault - * @set_stall: Configure whether stall on fault (CFCFG) is enabled. Call - * before set_ttbr0_cfg(). If stalling on fault is enabled, - * the GPU driver must call resume_translation() + * @set_stall: Configure whether stall on fault (CFCFG) is enabled. If + * stalling on fault is enabled, the GPU driver must call + * resume_translation() * @resume_translation: Resume translation after a fault * * @set_prr_bit: [optional] Configure the GPU's Partially Resident -- cgit v1.2.3 From fe26933cf1e151ce0a5d858c263e8dacb2f12cee Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 7 May 2025 10:13:21 -0400 Subject: vt: add ucs_get_fallback() This is the code querying the newly introduced tables. Signed-off-by: Nicolas Pitre Link: https://lore.kernel.org/r/20250507141535.40655-7-nico@fluxnic.net Signed-off-by: Greg Kroah-Hartman --- include/linux/consolemap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h index 8167494229db..6180b803795c 100644 --- a/include/linux/consolemap.h +++ b/include/linux/consolemap.h @@ -31,6 +31,7 @@ void console_map_init(void); bool ucs_is_double_width(uint32_t cp); bool ucs_is_zero_width(uint32_t cp); u32 ucs_recompose(u32 base, u32 mark); +u32 ucs_get_fallback(u32 cp); #else static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode) @@ -75,6 +76,11 @@ static inline u32 ucs_recompose(u32 base, u32 mark) { return 0; } + +static inline u32 ucs_get_fallback(u32 cp) +{ + return 0; +} #endif /* CONFIG_CONSOLE_TRANSLATIONS */ #endif /* __LINUX_CONSOLEMAP_H__ */ -- cgit v1.2.3 From 80fa7a03378588582eb40f89b6f418c0c256cf24 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 20 May 2025 13:16:43 -0400 Subject: vt: bracketed paste support This is comprised of 3 aspects: - Take note of when applications advertise bracketed paste support via "\e[?2004h" and "\e[?2004l". - Insert bracketed paste markers ("\e[200~" and "\e[201~") around pasted content in paste_selection() when bracketed paste is active. - Add TIOCL_GETBRACKETEDPASTE to return bracketed paste status so user space daemons implementing cut-and-paste functionality (e.g. gpm, BRLTTY) may know when to insert bracketed paste markers. Link: https://en.wikipedia.org/wiki/Bracketed-paste Signed-off-by: Nicolas Pitre Reviewed-by: Jiri Slaby Link: https://lore.kernel.org/r/20250520171851.1219676-2-nico@fluxnic.net Signed-off-by: Greg Kroah-Hartman --- include/linux/console_struct.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index 20f564e98552..59b4fec5f254 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -145,6 +145,7 @@ struct vc_data { unsigned int vc_need_wrap : 1; unsigned int vc_can_do_color : 1; unsigned int vc_report_mouse : 2; + unsigned int vc_bracketed_paste : 1; unsigned char vc_utf : 1; /* Unicode UTF-8 encoding */ unsigned char vc_utf_count; int vc_utf_char; -- cgit v1.2.3 From 279f2c2c8e2169403d01190f042efa6e41731578 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sat, 17 May 2025 17:14:53 +0200 Subject: futex: Use RCU_INIT_POINTER() in futex_mm_init(). There is no need for an explicit NULL pointer initialisation plus a comment why it is okay. RCU_INIT_POINTER() can be used for NULL initialisations and it is documented. This has been build tested with gcc version 9.3.0 (Debian 9.3.0-22) on a x86-64 defconfig. Fixes: 094ac8cff7858 ("futex: Relax the rcu_assign_pointer() assignment of mm->futex_phash in futex_mm_init()") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250517151455.1065363-4-bigeasy@linutronix.de --- include/linux/futex.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/futex.h b/include/linux/futex.h index 168ffd5996b4..005b040c4791 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -88,14 +88,7 @@ void futex_hash_free(struct mm_struct *mm); static inline void futex_mm_init(struct mm_struct *mm) { - /* - * No need for rcu_assign_pointer() here, as we can rely on - * tasklist_lock write-ordering in copy_process(), before - * the task's MM becomes visible and the ->futex_phash - * becomes externally observable: - */ - mm->futex_phash = NULL; - + RCU_INIT_POINTER(mm->futex_phash, NULL); mutex_init(&mm->futex_hash_lock); } -- cgit v1.2.3 From a9194f88782afa1386641451a6c76beaa60485a0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 16 May 2025 13:25:31 +0200 Subject: coredump: add coredump socket Coredumping currently supports two modes: (1) Dumping directly into a file somewhere on the filesystem. (2) Dumping into a pipe connected to a usermode helper process spawned as a child of the system_unbound_wq or kthreadd. For simplicity I'm mostly ignoring (1). There's probably still some users of (1) out there but processing coredumps in this way can be considered adventurous especially in the face of set*id binaries. The most common option should be (2) by now. It works by allowing userspace to put a string into /proc/sys/kernel/core_pattern like: |/usr/lib/systemd/systemd-coredump %P %u %g %s %t %c %h The "|" at the beginning indicates to the kernel that a pipe must be used. The path following the pipe indicator is a path to a binary that will be spawned as a usermode helper process. Any additional parameters pass information about the task that is generating the coredump to the binary that processes the coredump. In the example core_pattern shown above systemd-coredump is spawned as a usermode helper. There's various conceptual consequences of this (non-exhaustive list): - systemd-coredump is spawned with file descriptor number 0 (stdin) connected to the read-end of the pipe. All other file descriptors are closed. That specifically includes 1 (stdout) and 2 (stderr). This has already caused bugs because userspace assumed that this cannot happen (Whether or not this is a sane assumption is irrelevant.). - systemd-coredump will be spawned as a child of system_unbound_wq. So it is not a child of any userspace process and specifically not a child of PID 1. It cannot be waited upon and is in a weird hybrid upcall which are difficult for userspace to control correctly. - systemd-coredump is spawned with full kernel privileges. This necessitates all kinds of weird privilege dropping excercises in userspace to make this safe. - A new usermode helper has to be spawned for each crashing process. This series adds a new mode: (3) Dumping into an AF_UNIX socket. Userspace can set /proc/sys/kernel/core_pattern to: @/path/to/coredump.socket The "@" at the beginning indicates to the kernel that an AF_UNIX coredump socket will be used to process coredumps. The coredump socket must be located in the initial mount namespace. When a task coredumps it opens a client socket in the initial network namespace and connects to the coredump socket. - The coredump server uses SO_PEERPIDFD to get a stable handle on the connected crashing task. The retrieved pidfd will provide a stable reference even if the crashing task gets SIGKILLed while generating the coredump. - By setting core_pipe_limit non-zero userspace can guarantee that the crashing task cannot be reaped behind it's back and thus process all necessary information in /proc/. The SO_PEERPIDFD can be used to detect whether /proc/ still refers to the same process. The core_pipe_limit isn't used to rate-limit connections to the socket. This can simply be done via AF_UNIX sockets directly. - The pidfd for the crashing task will grow new information how the task coredumps. - The coredump server should mark itself as non-dumpable. - A container coredump server in a separate network namespace can simply bind to another well-know address and systemd-coredump fowards coredumps to the container. - Coredumps could in the future also be handled via per-user/session coredump servers that run only with that users privileges. The coredump server listens on the coredump socket and accepts a new coredump connection. It then retrieves SO_PEERPIDFD for the client, inspects uid/gid and hands the accepted client to the users own coredump handler which runs with the users privileges only (It must of coure pay close attention to not forward crashing suid binaries.). The new coredump socket will allow userspace to not have to rely on usermode helpers for processing coredumps and provides a safer way to handle them instead of relying on super privileged coredumping helpers that have and continue to cause significant CVEs. This will also be significantly more lightweight since no fork()+exec() for the usermodehelper is required for each crashing process. The coredump server in userspace can e.g., just keep a worker pool. Link: https://lore.kernel.org/20250516-work-coredump-socket-v8-4-664f3caf2516@kernel.org Acked-by: Luca Boccassi Reviewed-by: Kuniyuki Iwashima Reviewed-by: Alexander Mikhalitsyn Reviewed-by: Jann Horn Signed-off-by: Christian Brauner --- include/linux/net.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 0ff950eecc6b..139c85d0f2ea 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -81,6 +81,7 @@ enum sock_type { #ifndef SOCK_NONBLOCK #define SOCK_NONBLOCK O_NONBLOCK #endif +#define SOCK_COREDUMP O_NOCTTY #endif /* ARCH_HAS_SOCKET_TYPES */ -- cgit v1.2.3 From 1d8db6fd698de1f73b1a7d72aea578fdd18d9a87 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 16 May 2025 13:25:32 +0200 Subject: pidfs, coredump: add PIDFD_INFO_COREDUMP Extend the PIDFD_INFO_COREDUMP ioctl() with the new PIDFD_INFO_COREDUMP mask flag. This adds the @coredump_mask field to struct pidfd_info. When a task coredumps the kernel will provide the following information to userspace in @coredump_mask: * PIDFD_COREDUMPED is raised if the task did actually coredump. * PIDFD_COREDUMP_SKIP is raised if the task skipped coredumping (e.g., undumpable). * PIDFD_COREDUMP_USER is raised if this is a regular coredump and doesn't need special care by the coredump server. * PIDFD_COREDUMP_ROOT is raised if the generated coredump should be treated as sensitive and the coredump server should restrict to the generated coredump to sufficiently privileged users. The kernel guarantees that by the time the connection is made the all PIDFD_INFO_COREDUMP info is available. Link: https://lore.kernel.org/20250516-work-coredump-socket-v8-5-664f3caf2516@kernel.org Acked-by: Luca Boccassi Reviewed-by: Alexander Mikhalitsyn Reviewed-by: Jann Horn Signed-off-by: Christian Brauner --- include/linux/pidfs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 2676890c4d0d..77e7db194914 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -2,11 +2,16 @@ #ifndef _LINUX_PID_FS_H #define _LINUX_PID_FS_H +struct coredump_params; + struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); void pidfs_add_pid(struct pid *pid); void pidfs_remove_pid(struct pid *pid); void pidfs_exit(struct task_struct *tsk); +#ifdef CONFIG_COREDUMP +void pidfs_coredump(const struct coredump_params *cprm); +#endif extern const struct dentry_operations pidfs_dentry_operations; int pidfs_register_pid(struct pid *pid); void pidfs_get_pid(struct pid *pid); -- cgit v1.2.3 From 01465f296a6871222b185c350423978d44431c96 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 9 May 2025 13:24:50 +0100 Subject: nvmem: Remove unused nvmem cell table support Board files are deprecated by DT, and the last user of nvmem_add_cell_table() was removed by commit 2af4fcc0d3574482 ("ARM: davinci: remove unused board support") in v6.3. Hence remove all support for nvmem cell tables, and update the documentation. Device drivers can still register a single cell using nvmem_add_one_cell() (which was not documented before). Signed-off-by: Geert Uytterhoeven Acked-by: Arnd Bergmann Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20250509122452.11827-2-srini@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-provider.h | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 515676ebe598..615a560d9edb 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -137,25 +137,6 @@ struct nvmem_config { struct device *base_dev; }; -/** - * struct nvmem_cell_table - NVMEM cell definitions for given provider - * - * @nvmem_name: Provider name. - * @cells: Array of cell definitions. - * @ncells: Number of cell definitions in the array. - * @node: List node. - * - * This structure together with related helper functions is provided for users - * that don't can't access the nvmem provided structure but wish to register - * cell definitions for it e.g. board files registering an EEPROM device. - */ -struct nvmem_cell_table { - const char *nvmem_name; - const struct nvmem_cell_info *cells; - size_t ncells; - struct list_head node; -}; - /** * struct nvmem_layout - NVMEM layout definitions * @@ -190,9 +171,6 @@ void nvmem_unregister(struct nvmem_device *nvmem); struct nvmem_device *devm_nvmem_register(struct device *dev, const struct nvmem_config *cfg); -void nvmem_add_cell_table(struct nvmem_cell_table *table); -void nvmem_del_cell_table(struct nvmem_cell_table *table); - int nvmem_add_one_cell(struct nvmem_device *nvmem, const struct nvmem_cell_info *info); @@ -223,8 +201,6 @@ devm_nvmem_register(struct device *dev, const struct nvmem_config *c) return nvmem_register(c); } -static inline void nvmem_add_cell_table(struct nvmem_cell_table *table) {} -static inline void nvmem_del_cell_table(struct nvmem_cell_table *table) {} static inline int nvmem_add_one_cell(struct nvmem_device *nvmem, const struct nvmem_cell_info *info) { -- cgit v1.2.3 From b803c4a4f78834b31ebfbbcea350473333760559 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Fri, 2 May 2025 02:12:10 +0900 Subject: can: dev: add struct data_bittiming_params to group FD parameters This is a preparation patch for the introduction of CAN XL. CAN FD and CAN XL uses similar bittiming parameters. Add one level of nesting for all the CAN FD parameters. Typically: priv->can.data_bittiming; becomes: priv->can.fd.data_bittiming; This way, the CAN XL equivalent (to be introduced later) would be: priv->can.xl.data_bittiming; Add the new struct data_bittiming_params which contains all the data bittiming parameters, including the TDC and the callback functions. This done, update all the CAN FD drivers to make use of the new layout. Acked-by: Oliver Hartkopp Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250501171213.2161572-2-mailhol.vincent@wanadoo.fr [mkl: fix rcar_canfd] Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 23492213ea35..492d23bec7be 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -38,6 +38,17 @@ enum can_termination_gpio { CAN_TERMINATION_GPIO_MAX, }; +struct data_bittiming_params { + const struct can_bittiming_const *data_bittiming_const; + struct can_bittiming data_bittiming; + const struct can_tdc_const *tdc_const; + struct can_tdc tdc; + const u32 *data_bitrate_const; + unsigned int data_bitrate_const_cnt; + int (*do_set_data_bittiming)(struct net_device *dev); + int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv); +}; + /* * CAN common private data */ @@ -45,16 +56,11 @@ struct can_priv { struct net_device *dev; struct can_device_stats can_stats; - const struct can_bittiming_const *bittiming_const, - *data_bittiming_const; - struct can_bittiming bittiming, data_bittiming; - const struct can_tdc_const *tdc_const; - struct can_tdc tdc; - + const struct can_bittiming_const *bittiming_const; + struct can_bittiming bittiming; + struct data_bittiming_params fd; unsigned int bitrate_const_cnt; const u32 *bitrate_const; - const u32 *data_bitrate_const; - unsigned int data_bitrate_const_cnt; u32 bitrate_max; struct can_clock clock; @@ -77,14 +83,12 @@ struct can_priv { struct delayed_work restart_work; int (*do_set_bittiming)(struct net_device *dev); - int (*do_set_data_bittiming)(struct net_device *dev); int (*do_set_mode)(struct net_device *dev, enum can_mode mode); int (*do_set_termination)(struct net_device *dev, u16 term); int (*do_get_state)(const struct net_device *dev, enum can_state *state); int (*do_get_berr_counter)(const struct net_device *dev, struct can_berr_counter *bec); - int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv); }; static inline bool can_tdc_is_enabled(const struct can_priv *priv) @@ -114,11 +118,11 @@ static inline bool can_tdc_is_enabled(const struct can_priv *priv) */ static inline s32 can_get_relative_tdco(const struct can_priv *priv) { - const struct can_bittiming *dbt = &priv->data_bittiming; + const struct can_bittiming *dbt = &priv->fd.data_bittiming; s32 sample_point_in_tc = (CAN_SYNC_SEG + dbt->prop_seg + dbt->phase_seg1) * dbt->brp; - return (s32)priv->tdc.tdco - sample_point_in_tc; + return (s32)priv->fd.tdc.tdco - sample_point_in_tc; } /* helper to define static CAN controller features at device creation time */ -- cgit v1.2.3 From c1a606cd75fbe98d261b224b6dfb76d47f40dc12 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 19 May 2025 14:47:58 +0100 Subject: fs/netfs: remove unused flag NETFS_SREQ_SEEK_DATA_READ This flag was added by commit 3d3c95046742 ("netfs: Provide readahead and readpage netfs helpers") but its only user was removed by commit 86b374d061ee ("netfs: Remove fs/netfs/io.c"). Signed-off-by: Max Kellermann Signed-off-by: David Howells Link: https://lore.kernel.org/20250519134813.2975312-3-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c86a11cfc4a3..d315d86d0ad4 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -191,7 +191,6 @@ struct netfs_io_subrequest { unsigned long flags; #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ -#define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */ #define NETFS_SREQ_MADE_PROGRESS 4 /* Set if we transferred at least some data */ #define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ #define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */ -- cgit v1.2.3 From 9cd78ca04fb827f42d7c0d492b96fbb940451266 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 19 May 2025 14:47:59 +0100 Subject: fs/netfs: remove unused source NETFS_INVALID_WRITE This enum choice was added by commit 16af134ca4b7 ("netfs: Extend the netfs_io_*request structs to handle writes") and its only user was later removed by commit c245868524cc ("netfs: Remove the old writeback code"). Signed-off-by: Max Kellermann Signed-off-by: David Howells Link: https://lore.kernel.org/20250519134813.2975312-4-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index d315d86d0ad4..5a76bea51d24 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -48,7 +48,6 @@ enum netfs_io_source { NETFS_INVALID_READ, NETFS_UPLOAD_TO_SERVER, NETFS_WRITE_TO_CACHE, - NETFS_INVALID_WRITE, } __mode(byte); typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error, -- cgit v1.2.3 From 9fcf235e91fae9f3e99f4e09d332ed09296b11ec Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 19 May 2025 14:48:00 +0100 Subject: fs/netfs: remove unused flag NETFS_ICTX_WRITETHROUGH This flag was added by commit 41d8e7673a77 ("netfs: Implement a write-through caching option") but it was never used. Signed-off-by: Max Kellermann Signed-off-by: David Howells Link: https://lore.kernel.org/20250519134813.2975312-5-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5a76bea51d24..242daec8c837 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -70,7 +70,6 @@ struct netfs_inode { unsigned long flags; #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ -#define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ #define NETFS_ICTX_MODIFIED_ATTR 3 /* Indicate change in mtime/ctime */ #define NETFS_ICTX_SINGLE_NO_UPLOAD 4 /* Monolithic payload, cache but no upload */ }; -- cgit v1.2.3 From d46a7b217d6abbaea78ce5abf4b295e3c1d819d9 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 19 May 2025 14:48:01 +0100 Subject: fs/netfs: remove unused enum choice NETFS_READ_HOLE_CLEAR This choice was added by commit 3a11b3a86366 ("netfs: Pass more information on how to deal with a hole in the cache") but the last user was removed by commit 86b374d061ee ("netfs: Remove fs/netfs/io.c"). Signed-off-by: Max Kellermann Signed-off-by: David Howells Link: https://lore.kernel.org/20250519134813.2975312-6-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/fscache.h | 3 --- include/linux/netfs.h | 1 - 2 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 9de27643607f..fea0d9779b55 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -498,9 +498,6 @@ static inline void fscache_end_operation(struct netfs_cache_resources *cres) * * NETFS_READ_HOLE_IGNORE - Just try to read (may return a short read). * - * NETFS_READ_HOLE_CLEAR - Seek for data, clearing the part of the buffer - * skipped over, then do as for IGNORE. - * * NETFS_READ_HOLE_FAIL - Give ENODATA if we encounter a hole. */ static inline diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 242daec8c837..73537dafa224 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -318,7 +318,6 @@ struct netfs_request_ops { */ enum netfs_read_from_hole { NETFS_READ_HOLE_IGNORE, - NETFS_READ_HOLE_CLEAR, NETFS_READ_HOLE_FAIL, }; -- cgit v1.2.3 From 314ee7035febc86f4f9452fb90a6c2165a183fe5 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 19 May 2025 14:48:02 +0100 Subject: fs/netfs: reorder struct fields to eliminate holes This shrinks `struct netfs_io_stream` from 104 to 96 bytes and `struct netfs_io_request` from 600 to 576 bytes. [DH: Modified as the patch to turn netfs_io_request::error into a short was removed from the set] Signed-off-by: Max Kellermann Signed-off-by: David Howells Link: https://lore.kernel.org/20250519134813.2975312-7-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 73537dafa224..33f145f7f2c2 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -144,8 +144,8 @@ struct netfs_io_stream { struct netfs_io_subrequest *front; /* Op being collected */ unsigned long long collected_to; /* Position we've collected results to */ size_t transferred; /* The amount transferred from this stream */ - enum netfs_io_source source; /* Where to read from/write to */ unsigned short error; /* Aggregate error for the stream */ + enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* Index of stream in parent table */ bool avail; /* T if stream is available */ bool active; /* T if stream is active */ @@ -240,19 +240,10 @@ struct netfs_io_request { void *netfs_priv; /* Private data for the netfs */ void *netfs_priv2; /* Private data for the netfs */ struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ - unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ - unsigned int debug_id; - unsigned int rsize; /* Maximum read size (0 for none) */ - unsigned int wsize; /* Maximum write size (0 for none) */ - atomic_t subreq_counter; /* Next subreq->debug_index */ - unsigned int nr_group_rel; /* Number of refs to release on ->group */ - spinlock_t lock; /* Lock for queuing subreqs */ unsigned long long submitted; /* Amount submitted for I/O so far */ unsigned long long len; /* Length of the request */ size_t transferred; /* Amount to be indicated as transferred */ long error; /* 0 or error that occurred */ - enum netfs_io_origin origin; /* Origin of the request */ - bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ unsigned long long i_size; /* Size of the file */ unsigned long long start; /* Start position */ atomic64_t issued_to; /* Write issuer folio cursor */ @@ -260,7 +251,16 @@ struct netfs_io_request { unsigned long long cleaned_to; /* Position we've cleaned folios to */ unsigned long long abandon_to; /* Position to abandon folios to */ pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ + unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ + unsigned int debug_id; + unsigned int rsize; /* Maximum read size (0 for none) */ + unsigned int wsize; /* Maximum write size (0 for none) */ + atomic_t subreq_counter; /* Next subreq->debug_index */ + unsigned int nr_group_rel; /* Number of refs to release on ->group */ + spinlock_t lock; /* Lock for queuing subreqs */ unsigned char front_folio_order; /* Order (size) of front folio */ + enum netfs_io_origin origin; /* Origin of the request */ + bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ refcount_t ref; unsigned long flags; #define NETFS_RREQ_OFFLOAD_COLLECTION 0 /* Offload collection to workqueue */ -- cgit v1.2.3 From 3dc00bca8dc8226f79f6958293a2777f0691cfb5 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 19 May 2025 14:48:03 +0100 Subject: fs/netfs: remove `netfs_io_request.ractl` Since this field is only used by netfs_prepare_read_iterator() when called by netfs_readahead(), we can simply pass it as parameter. This shrinks the struct from 576 to 568 bytes. Signed-off-by: Max Kellermann Signed-off-by: David Howells Link: https://lore.kernel.org/20250519134813.2975312-8-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 33f145f7f2c2..2b127527544e 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -228,7 +228,6 @@ struct netfs_io_request { struct kiocb *iocb; /* AIO completion vector */ struct netfs_cache_resources cache_resources; struct netfs_io_request *copy_to_cache; /* Request to write just-read data to the cache */ - struct readahead_control *ractl; /* Readahead descriptor */ struct list_head proc_link; /* Link in netfs_iorequests */ struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ #define NR_IO_STREAMS 2 //wreq->nr_io_streams -- cgit v1.2.3 From 07c08bac9302012d9a91d40e95c8f98800f7d787 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 19 May 2025 14:48:04 +0100 Subject: fs/netfs: declare field `proc_link` only if CONFIG_PROC_FS=y This field is only used for the "proc" filesystem. Signed-off-by: Max Kellermann Signed-off-by: David Howells Link: https://lore.kernel.org/20250519134813.2975312-9-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 2b127527544e..3f7056d837f8 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -228,7 +228,9 @@ struct netfs_io_request { struct kiocb *iocb; /* AIO completion vector */ struct netfs_cache_resources cache_resources; struct netfs_io_request *copy_to_cache; /* Request to write just-read data to the cache */ +#ifdef CONFIG_PROC_FS struct list_head proc_link; /* Link in netfs_iorequests */ +#endif struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ #define NR_IO_STREAMS 2 //wreq->nr_io_streams struct netfs_group *group; /* Writeback group being written back */ -- cgit v1.2.3 From 6bb09e5db3a07cf3e03f25f725bd8edc959e38ba Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 19 May 2025 14:48:05 +0100 Subject: folio_queue: remove unused field `marks3` The last user was removed by commit e2d46f2ec332 ("netfs: Change the read result collector to only use one work item"). Signed-off-by: Max Kellermann Signed-off-by: David Howells Link: https://lore.kernel.org/20250519134813.2975312-10-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/folio_queue.h | 42 ------------------------------------------ 1 file changed, 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h index 45ad2408a80c..adab609c972e 100644 --- a/include/linux/folio_queue.h +++ b/include/linux/folio_queue.h @@ -34,7 +34,6 @@ struct folio_queue { struct folio_queue *prev; /* Previous queue segment of NULL */ unsigned long marks; /* 1-bit mark per folio */ unsigned long marks2; /* Second 1-bit mark per folio */ - unsigned long marks3; /* Third 1-bit mark per folio */ #if PAGEVEC_SIZE > BITS_PER_LONG #error marks is not big enough #endif @@ -58,7 +57,6 @@ static inline void folioq_init(struct folio_queue *folioq, unsigned int rreq_id) folioq->prev = NULL; folioq->marks = 0; folioq->marks2 = 0; - folioq->marks3 = 0; folioq->rreq_id = rreq_id; folioq->debug_id = 0; } @@ -178,45 +176,6 @@ static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot) clear_bit(slot, &folioq->marks2); } -/** - * folioq_is_marked3: Check third folio mark in a folio queue segment - * @folioq: The segment to query - * @slot: The slot number of the folio to query - * - * Determine if the third mark is set for the folio in the specified slot in a - * folio queue segment. - */ -static inline bool folioq_is_marked3(const struct folio_queue *folioq, unsigned int slot) -{ - return test_bit(slot, &folioq->marks3); -} - -/** - * folioq_mark3: Set the third mark on a folio in a folio queue segment - * @folioq: The segment to modify - * @slot: The slot number of the folio to modify - * - * Set the third mark for the folio in the specified slot in a folio queue - * segment. - */ -static inline void folioq_mark3(struct folio_queue *folioq, unsigned int slot) -{ - set_bit(slot, &folioq->marks3); -} - -/** - * folioq_unmark3: Clear the third mark on a folio in a folio queue segment - * @folioq: The segment to modify - * @slot: The slot number of the folio to modify - * - * Clear the third mark for the folio in the specified slot in a folio queue - * segment. - */ -static inline void folioq_unmark3(struct folio_queue *folioq, unsigned int slot) -{ - clear_bit(slot, &folioq->marks3); -} - /** * folioq_append: Add a folio to a folio queue segment * @folioq: The segment to add to @@ -318,7 +277,6 @@ static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot) folioq->vec.folios[slot] = NULL; folioq_unmark(folioq, slot); folioq_unmark2(folioq, slot); - folioq_unmark3(folioq, slot); } #endif /* _LINUX_FOLIO_QUEUE_H */ -- cgit v1.2.3 From 67b916719a15c33fd18d6e8298828caeef428d00 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 19 May 2025 14:48:06 +0100 Subject: fs/netfs: remove unused flag NETFS_RREQ_DONT_UNLOCK_FOLIOS NETFS_RREQ_DONT_UNLOCK_FOLIOS has never been used ever since it was added by commit 3d3c95046742 ("netfs: Provide readahead and readpage netfs helpers"). Signed-off-by: Max Kellermann Signed-off-by: David Howells Link: https://lore.kernel.org/20250519134813.2975312-11-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 3f7056d837f8..5f60d8e3a7ef 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -266,7 +266,6 @@ struct netfs_io_request { unsigned long flags; #define NETFS_RREQ_OFFLOAD_COLLECTION 0 /* Offload collection to workqueue */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ -#define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ #define NETFS_RREQ_FOLIO_COPY_TO_CACHE 6 /* Copy current folio to cache from read */ -- cgit v1.2.3 From 4b1ca12dd3f2529dc788cf4f18259ed62006ccb8 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 19 May 2025 14:48:07 +0100 Subject: fs/netfs: remove unused flag NETFS_RREQ_BLOCKED NETFS_RREQ_BLOCKED was added by commit 016dc8516aec ("netfs: Implement unbuffered/DIO read support") but has never been used either. Without NETFS_RREQ_BLOCKED, NETFS_RREQ_NONBLOCK makes no sense, and thus can be removed as well. Signed-off-by: Max Kellermann Signed-off-by: David Howells Link: https://lore.kernel.org/20250519134813.2975312-12-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5f60d8e3a7ef..cf634c28522d 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -270,8 +270,6 @@ struct netfs_io_request { #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ #define NETFS_RREQ_FOLIO_COPY_TO_CACHE 6 /* Copy current folio to cache from read */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ -#define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ -#define NETFS_RREQ_BLOCKED 10 /* We blocked */ #define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */ #define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */ #define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */ -- cgit v1.2.3 From 34eb98c6598c4057640ca56dd1fad6555187473a Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 19 May 2025 10:07:02 +0100 Subject: netfs: Fix setting of transferred bytes with short DIO reads A netfslib request comprises an ordered stream of subrequests that, when doing an unbuffered/DIO read, are contiguous. The subrequests may be performed in parallel, but may not be fully completed. For instance, if we try and make a 256KiB DIO read from a 3-byte file with a 64KiB rsize and 256KiB bsize, netfslib will attempt to make a read of 256KiB, broken up into four 64KiB subreads, with the expectation that the first will be short and the subsequent three be completely devoid - but we do all four on the basis that the file may have been changed by a third party. The read-collection code, however, walks through all the subreqs and advances the notion of how much data has been read in the stream to the start of each subreq plus its amount transferred (which are 3, 0, 0, 0 for the example above) - which gives an amount apparently read of 3*64KiB - which is incorrect. Fix the collection code to cut short the calculation of the transferred amount with the first short subrequest in an unbuffered read; everything beyond that must be ignored as there's a hole that cannot be filled. This applies both to shortness due to hitting the EOF and shortness due to an error. This is achieved by setting a flag on the request when we collect the first short subrequest (collection is done in ascending order). This can be tested by mounting a cifs volume with rsize=65536,bsize=262144 and doing a 256k DIO read of a very small file (e.g. 3 bytes). read() should return 3, not >3. This problem came in when netfs_read_collection() set rreq->transferred to stream->transferred, even for DIO. Prior to that, netfs_rreq_assess_dio() just went over the list and added up the subreqs till it met a short one - but now the subreqs are discarded earlier. Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") Reported-by: Nicolas Baranger Closes: https://lore.kernel.org/all/10bec2430ed4df68bde10ed95295d093@3xo.fr/ Signed-off-by: "Paulo Alcantara (Red Hat)" Signed-off-by: David Howells Link: https://lore.kernel.org/20250519090707.2848510-3-dhowells@redhat.com cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c86a11cfc4a3..497c4f4698f6 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -279,6 +279,7 @@ struct netfs_io_request { #define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */ #define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */ #define NETFS_RREQ_RETRYING 14 /* Set if we're in the retry path */ +#define NETFS_RREQ_SHORT_TRANSFER 15 /* Set if we have a short transfer */ #define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark * write to cache on read */ const struct netfs_request_ops *netfs_ops; -- cgit v1.2.3 From 20d72b00ca814d748f5663484e5c53bb2bf37a3a Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 19 May 2025 10:07:03 +0100 Subject: netfs: Fix the request's work item to not require a ref When the netfs_io_request struct's work item is queued, it must be supplied with a ref to the work item struct to prevent it being deallocated whilst on the queue or whilst it is being processed. This is tricky to manage as we have to get a ref before we try and queue it and then we may find it's already queued and is thus already holding a ref - in which case we have to try and get rid of the ref again. The problem comes if we're in BH or IRQ context and need to drop the ref: if netfs_put_request() reduces the count to 0, we have to do the cleanup - but the cleanup may need to wait. Fix this by adding a new work item to the request, ->cleanup_work, and dispatching that when the refcount hits zero. That can then synchronously cancel any outstanding work on the main work item before doing the cleanup. Adding a new work item also deals with another problem upstream where it's sometimes changing the work func in the put function and requeuing it - which has occasionally in the past caused the cleanup to happen incorrectly. As a bonus, this allows us to get rid of the 'was_async' parameter from a bunch of functions. This indicated whether the put function might not be permitted to sleep. Fixes: 3d3c95046742 ("netfs: Provide readahead and readpage netfs helpers") Signed-off-by: David Howells Link: https://lore.kernel.org/20250519090707.2848510-4-dhowells@redhat.com cc: Paulo Alcantara cc: Marc Dionne cc: Steve French cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/fscache.h | 2 +- include/linux/netfs.h | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 9de27643607f..266e6c9e6f83 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -628,7 +628,7 @@ static inline void fscache_write_to_cache(struct fscache_cookie *cookie, term_func, term_func_priv, using_pgpriv2, caching); else if (term_func) - term_func(term_func_priv, -ENOBUFS, false); + term_func(term_func_priv, -ENOBUFS); } diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 497c4f4698f6..c3f230732f51 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -51,8 +51,7 @@ enum netfs_io_source { NETFS_INVALID_WRITE, } __mode(byte); -typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error, - bool was_async); +typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error); /* * Per-inode context. This wraps the VFS inode. @@ -223,9 +222,10 @@ enum netfs_io_origin { */ struct netfs_io_request { union { - struct work_struct work; + struct work_struct cleanup_work; /* Deferred cleanup work */ struct rcu_head rcu; }; + struct work_struct work; /* Result collector work */ struct inode *inode; /* The file being accessed */ struct address_space *mapping; /* The mapping being accessed */ struct kiocb *iocb; /* AIO completion vector */ @@ -270,7 +270,7 @@ struct netfs_io_request { #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ -#define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ +#define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes (has ref) */ #define NETFS_RREQ_FOLIO_COPY_TO_CACHE 6 /* Copy current folio to cache from read */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ #define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ @@ -440,15 +440,14 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); void netfs_put_subrequest(struct netfs_io_subrequest *subreq, - bool was_async, enum netfs_sreq_ref_trace what); + enum netfs_sreq_ref_trace what); ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, struct iov_iter *new, iov_iter_extraction_t extraction_flags); size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, size_t max_size, size_t max_segs); void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq); -void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, - bool was_async); +void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error); void netfs_queue_write_request(struct netfs_io_subrequest *subreq); int netfs_start_io_read(struct inode *inode); -- cgit v1.2.3 From 8f08055bc67a355aca55856cc810b89645506a5f Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 13 Apr 2025 11:34:28 +0100 Subject: iio: introduced iio_push_to_buffers_with_ts() that takes a data_total_len argument. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check that data_total_len argument against iio_dev->scan_bytes. The size needs to be at least as big as the scan. It can be larger, which is typical if only part of fixed sized storage is used due to a subset of channels being enabled. Reviewed-by: Nuno Sá Reviewed-by: David Lechner Link: https://patch.msgid.link/20250413103443.2420727-6-jic23@kernel.org Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index 3b8d618bb3df..5c84ec4a9810 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -45,6 +45,18 @@ static inline int iio_push_to_buffers_with_timestamp(struct iio_dev *indio_dev, return iio_push_to_buffers(indio_dev, data); } +static inline int iio_push_to_buffers_with_ts(struct iio_dev *indio_dev, + void *data, size_t data_total_len, + s64 timestamp) +{ + if (unlikely(data_total_len < indio_dev->scan_bytes)) { + dev_err(&indio_dev->dev, "Undersized storage pushed to buffer\n"); + return -ENOSPC; + } + + return iio_push_to_buffers_with_timestamp(indio_dev, data, timestamp); +} + int iio_push_to_buffers_with_ts_unaligned(struct iio_dev *indio_dev, const void *data, size_t data_sz, int64_t timestamp); -- cgit v1.2.3 From 7e00d74eacf77c98a60e7c754a9e5f73d8832095 Mon Sep 17 00:00:00 2001 From: Chelsy Ratnawat Date: Thu, 1 May 2025 17:36:55 -0700 Subject: HID: sensor-hub: Fix typo and improve documentation for sensor_hub_remove_callback() Fixed a typo in "registered" and improved grammar for better readability and consistency with kernel-doc standards. No functional changes. Signed-off-by: Chelsy Ratnawat Reviewed-by: David Lechner Link: https://patch.msgid.link/20250502003655.1943000-1-chelsyratnawat2001@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/hid-sensor-hub.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid-sensor-hub.h b/include/linux/hid-sensor-hub.h index c27329e2a5ad..0f9f7df865db 100644 --- a/include/linux/hid-sensor-hub.h +++ b/include/linux/hid-sensor-hub.h @@ -128,12 +128,13 @@ int sensor_hub_register_callback(struct hid_sensor_hub_device *hsdev, struct hid_sensor_hub_callbacks *usage_callback); /** -* sensor_hub_remove_callback() - Remove client callbacks +* sensor_hub_remove_callback() - Remove client callback * @hsdev: Hub device instance. -* @usage_id: Usage id of the client (E.g. 0x200076 for Gyro). +* @usage_id: Usage id of the client (e.g. 0x200076 for gyro). * -* If there is a callback registred, this call will remove that -* callbacks, so that it will stop data and event notifications. +* Removes a previously registered callback for the given usage_id +* and hsdev. Once removed, the client will no longer receive data or +* event notifications. */ int sensor_hub_remove_callback(struct hid_sensor_hub_device *hsdev, u32 usage_id); -- cgit v1.2.3 From fa19c303254bae5b8d105ecdc7d714d092f2adda Mon Sep 17 00:00:00 2001 From: David Lechner Date: Wed, 7 May 2025 15:42:40 -0500 Subject: iio: make IIO_DMA_MINALIGN minimum of 8 bytes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a condition to ensure that IIO_DMA_MINALIGN is at least 8 bytes. On some 32-bit architectures, IIO_DMA_MINALIGN is 4. In many cases, drivers are using this alignment for buffers that include a 64-bit timestamp that is used with iio_push_to_buffers_with_ts(), which expects the timestamp to be aligned to 8 bytes. To handle this, we can just make IIO_DMA_MINALIGN at least 8 bytes. Reviewed-by: Nuno Sá Signed-off-by: David Lechner Link: https://patch.msgid.link/20250507-iio-introduce-iio_declare_buffer_with_ts-v6-1-4aee1b9f1b89@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 638cf2420fbd..a574f22398e4 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include /* IIO TODO LIST */ @@ -775,8 +776,14 @@ static inline void *iio_device_get_drvdata(const struct iio_dev *indio_dev) * to in turn include IIO_DMA_MINALIGN'd elements such as buffers which * must not share cachelines with the rest of the structure, thus making * them safe for use with non-coherent DMA. + * + * A number of drivers also use this on buffers that include a 64-bit timestamp + * that is used with iio_push_to_buffer_with_ts(). Therefore, in the case where + * DMA alignment is not sufficient for proper timestamp alignment, we align to + * 8 bytes instead. */ -#define IIO_DMA_MINALIGN ARCH_DMA_MINALIGN +#define IIO_DMA_MINALIGN MAX(ARCH_DMA_MINALIGN, sizeof(s64)) + struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv); /* The information at the returned address is guaranteed to be cacheline aligned */ -- cgit v1.2.3 From 63fc53526d3090b27bd06bb43b92e8bc85f46fb1 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Wed, 7 May 2025 15:42:41 -0500 Subject: iio: introduce IIO_DECLARE_BUFFER_WITH_TS macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add new macros to help with the common case of declaring a buffer that is safe to use with iio_push_to_buffers_with_ts(). This is not trivial to do correctly because of the alignment requirements of the timestamp. This will make it easier for both authors and reviewers. To avoid double __align() attributes in cases where we also need DMA alignment, add a 2nd variant IIO_DECLARE_DMA_BUFFER_WITH_TS(). Reviewed-by: Nuno Sá Signed-off-by: David Lechner Link: https://patch.msgid.link/20250507-iio-introduce-iio_declare_buffer_with_ts-v6-2-4aee1b9f1b89@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index a574f22398e4..d11668f14a3e 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -7,6 +7,7 @@ #ifndef _INDUSTRIAL_IO_H_ #define _INDUSTRIAL_IO_H_ +#include #include #include #include @@ -784,6 +785,37 @@ static inline void *iio_device_get_drvdata(const struct iio_dev *indio_dev) */ #define IIO_DMA_MINALIGN MAX(ARCH_DMA_MINALIGN, sizeof(s64)) +#define __IIO_DECLARE_BUFFER_WITH_TS(type, name, count) \ + type name[ALIGN((count), sizeof(s64) / sizeof(type)) + sizeof(s64) / sizeof(type)] + +/** + * IIO_DECLARE_BUFFER_WITH_TS() - Declare a buffer with timestamp + * @type: element type of the buffer + * @name: identifier name of the buffer + * @count: number of elements in the buffer + * + * Declares a buffer that is safe to use with iio_push_to_buffer_with_ts(). In + * addition to allocating enough space for @count elements of @type, it also + * allocates space for a s64 timestamp at the end of the buffer and ensures + * proper alignment of the timestamp. + */ +#define IIO_DECLARE_BUFFER_WITH_TS(type, name, count) \ + __IIO_DECLARE_BUFFER_WITH_TS(type, name, count) __aligned(sizeof(s64)) + +/** + * IIO_DECLARE_DMA_BUFFER_WITH_TS() - Declare a DMA-aligned buffer with timestamp + * @type: element type of the buffer + * @name: identifier name of the buffer + * @count: number of elements in the buffer + * + * Same as IIO_DECLARE_BUFFER_WITH_TS(), but is uses __aligned(IIO_DMA_MINALIGN) + * to ensure that the buffer doesn't share cachelines with anything that comes + * before it in a struct. This should not be used for stack-allocated buffers + * as stack memory cannot generally be used for DMA. + */ +#define IIO_DECLARE_DMA_BUFFER_WITH_TS(type, name, count) \ + __IIO_DECLARE_BUFFER_WITH_TS(type, name, count) __aligned(IIO_DMA_MINALIGN) + struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv); /* The information at the returned address is guaranteed to be cacheline aligned */ -- cgit v1.2.3 From 27737b8407585c0160063b9b39e888d486d86188 Mon Sep 17 00:00:00 2001 From: Chelsy Ratnawat Date: Tue, 6 May 2025 22:57:45 -0700 Subject: HID: sensor-hub: Fix typo and improve documentation Includes the following corrections - - Changed Measurment -> Measurement - Changed clode -> close - Gyro -> gyro Signed-off-by: Chelsy Ratnawat Reviewed-by: David Lechner Link: https://patch.msgid.link/20250507055745.4069933-1-chelsyratnawat2001@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/hid-sensor-hub.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid-sensor-hub.h b/include/linux/hid-sensor-hub.h index 0f9f7df865db..e71056553108 100644 --- a/include/linux/hid-sensor-hub.h +++ b/include/linux/hid-sensor-hub.h @@ -17,7 +17,7 @@ * @attrib_id: Attribute id for this attribute. * @report_id: Report id in which this information resides. * @index: Field index in the report. - * @units: Measurment unit for this attribute. + * @units: Measurement unit for this attribute. * @unit_expo: Exponent used in the data. * @size: Size in bytes for data size. * @logical_minimum: Logical minimum value for this attribute. @@ -39,8 +39,8 @@ struct hid_sensor_hub_attribute_info { * struct sensor_hub_pending - Synchronous read pending information * @status: Pending status true/false. * @ready: Completion synchronization data. - * @usage_id: Usage id for physical device, E.g. Gyro usage id. - * @attr_usage_id: Usage Id of a field, E.g. X-AXIS for a gyro. + * @usage_id: Usage id for physical device, e.g. gyro usage id. + * @attr_usage_id: Usage Id of a field, e.g. X-axis for a gyro. * @raw_size: Response size for a read request. * @raw_data: Place holder for received response. */ @@ -104,10 +104,10 @@ struct hid_sensor_hub_callbacks { int sensor_hub_device_open(struct hid_sensor_hub_device *hsdev); /** -* sensor_hub_device_clode() - Close hub device +* sensor_hub_device_close() - Close hub device * @hsdev: Hub device instance. * -* Used to clode hid device for sensor hub. +* Used to close hid device for sensor hub. */ void sensor_hub_device_close(struct hid_sensor_hub_device *hsdev); -- cgit v1.2.3 From 46550e2b878d60923c72f0526a7aac02e8eda3d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Sun, 4 May 2025 20:22:44 +0200 Subject: include: pe.h: Fix PE definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Rename constants to their standard PE names: - MZ_MAGIC -> IMAGE_DOS_SIGNATURE - PE_MAGIC -> IMAGE_NT_SIGNATURE - PE_OPT_MAGIC_PE32_ROM -> IMAGE_ROM_OPTIONAL_HDR_MAGIC - PE_OPT_MAGIC_PE32 -> IMAGE_NT_OPTIONAL_HDR32_MAGIC - PE_OPT_MAGIC_PE32PLUS -> IMAGE_NT_OPTIONAL_HDR64_MAGIC - IMAGE_DLL_CHARACTERISTICS_NX_COMPAT -> IMAGE_DLLCHARACTERISTICS_NX_COMPAT * Import constants and their description from readpe and file projects which contains current up-to-date information: - IMAGE_FILE_MACHINE_* - IMAGE_FILE_* - IMAGE_SUBSYSTEM_* - IMAGE_DLLCHARACTERISTICS_* - IMAGE_DLLCHARACTERISTICS_EX_* - IMAGE_DEBUG_TYPE_* * Add missing IMAGE_SCN_* constants and update their incorrect description * Fix incorrect value of IMAGE_SCN_MEM_PURGEABLE constant * Add description for win32_version and loader_flags PE fields Signed-off-by: Pali Rohár Signed-off-by: Ard Biesheuvel --- include/linux/pe.h | 279 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 174 insertions(+), 105 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pe.h b/include/linux/pe.h index fdf9c95709ba..cd2b7275385f 100644 --- a/include/linux/pe.h +++ b/include/linux/pe.h @@ -39,113 +39,160 @@ */ #define LINUX_PE_MAGIC 0x818223cd -#define MZ_MAGIC 0x5a4d /* "MZ" */ +#define IMAGE_DOS_SIGNATURE 0x5a4d /* "MZ" */ -#define PE_MAGIC 0x00004550 /* "PE\0\0" */ -#define PE_OPT_MAGIC_PE32 0x010b -#define PE_OPT_MAGIC_PE32_ROM 0x0107 -#define PE_OPT_MAGIC_PE32PLUS 0x020b +#define IMAGE_NT_SIGNATURE 0x00004550 /* "PE\0\0" */ + +#define IMAGE_ROM_OPTIONAL_HDR_MAGIC 0x0107 /* ROM image (for R3000/R4000/R10000/ALPHA), without MZ and PE\0\0 sign */ +#define IMAGE_NT_OPTIONAL_HDR32_MAGIC 0x010b /* PE32 executable image */ +#define IMAGE_NT_OPTIONAL_HDR64_MAGIC 0x020b /* PE32+ executable image */ /* machine type */ -#define IMAGE_FILE_MACHINE_UNKNOWN 0x0000 -#define IMAGE_FILE_MACHINE_AM33 0x01d3 -#define IMAGE_FILE_MACHINE_AMD64 0x8664 -#define IMAGE_FILE_MACHINE_ARM 0x01c0 -#define IMAGE_FILE_MACHINE_ARMV7 0x01c4 -#define IMAGE_FILE_MACHINE_ARM64 0xaa64 -#define IMAGE_FILE_MACHINE_EBC 0x0ebc -#define IMAGE_FILE_MACHINE_I386 0x014c -#define IMAGE_FILE_MACHINE_IA64 0x0200 -#define IMAGE_FILE_MACHINE_M32R 0x9041 -#define IMAGE_FILE_MACHINE_MIPS16 0x0266 -#define IMAGE_FILE_MACHINE_MIPSFPU 0x0366 -#define IMAGE_FILE_MACHINE_MIPSFPU16 0x0466 -#define IMAGE_FILE_MACHINE_POWERPC 0x01f0 -#define IMAGE_FILE_MACHINE_POWERPCFP 0x01f1 -#define IMAGE_FILE_MACHINE_R4000 0x0166 -#define IMAGE_FILE_MACHINE_RISCV32 0x5032 -#define IMAGE_FILE_MACHINE_RISCV64 0x5064 -#define IMAGE_FILE_MACHINE_RISCV128 0x5128 -#define IMAGE_FILE_MACHINE_SH3 0x01a2 -#define IMAGE_FILE_MACHINE_SH3DSP 0x01a3 -#define IMAGE_FILE_MACHINE_SH3E 0x01a4 -#define IMAGE_FILE_MACHINE_SH4 0x01a6 -#define IMAGE_FILE_MACHINE_SH5 0x01a8 -#define IMAGE_FILE_MACHINE_THUMB 0x01c2 -#define IMAGE_FILE_MACHINE_WCEMIPSV2 0x0169 -#define IMAGE_FILE_MACHINE_LOONGARCH32 0x6232 -#define IMAGE_FILE_MACHINE_LOONGARCH64 0x6264 +#define IMAGE_FILE_MACHINE_UNKNOWN 0x0000 /* Unknown architecture */ +#define IMAGE_FILE_MACHINE_TARGET_HOST 0x0001 /* Interacts with the host and not a WOW64 guest (not for file image) */ +#define IMAGE_FILE_MACHINE_ALPHA_OLD 0x0183 /* DEC Alpha AXP 32-bit (old images) */ +#define IMAGE_FILE_MACHINE_ALPHA 0x0184 /* DEC Alpha AXP 32-bit */ +#define IMAGE_FILE_MACHINE_ALPHA64 0x0284 /* DEC Alpha AXP 64-bit (with 8kB page size) */ +#define IMAGE_FILE_MACHINE_AXP64 IMAGE_FILE_MACHINE_ALPHA64 +#define IMAGE_FILE_MACHINE_AM33 0x01d3 /* Matsushita AM33, now Panasonic MN103 */ +#define IMAGE_FILE_MACHINE_AMD64 0x8664 /* AMD64 (x64) */ +#define IMAGE_FILE_MACHINE_ARM 0x01c0 /* ARM Little-Endian (ARMv4) */ +#define IMAGE_FILE_MACHINE_THUMB 0x01c2 /* ARM Thumb Little-Endian (ARMv4T) */ +#define IMAGE_FILE_MACHINE_ARMNT 0x01c4 /* ARM Thumb-2 Little-Endian (ARMv7) */ +#define IMAGE_FILE_MACHINE_ARMV7 IMAGE_FILE_MACHINE_ARMNT +#define IMAGE_FILE_MACHINE_ARM64 0xaa64 /* ARM64 Little-Endian (Classic ABI) */ +#define IMAGE_FILE_MACHINE_ARM64EC 0xa641 /* ARM64 Little-Endian (Emulation Compatible ABI for AMD64) */ +#define IMAGE_FILE_MACHINE_ARM64X 0xa64e /* ARM64 Little-Endian (fat binary with both Classic ABI and EC ABI code) */ +#define IMAGE_FILE_MACHINE_CEE 0xc0ee /* COM+ Execution Engine (CLR pure MSIL object files) */ +#define IMAGE_FILE_MACHINE_CEF 0x0cef /* Windows CE 3.0 Common Executable Format (CEF bytecode) */ +#define IMAGE_FILE_MACHINE_CHPE_X86 0x3a64 /* ARM64 Little-Endian (Compiled Hybrid PE ABI for I386) */ +#define IMAGE_FILE_MACHINE_HYBRID_X86 IMAGE_FILE_MACHINE_CHPE_X86 +#define IMAGE_FILE_MACHINE_EBC 0x0ebc /* EFI/UEFI Byte Code */ +#define IMAGE_FILE_MACHINE_I386 0x014c /* Intel 386 (x86) */ +#define IMAGE_FILE_MACHINE_I860 0x014d /* Intel 860 (N10) */ +#define IMAGE_FILE_MACHINE_IA64 0x0200 /* Intel IA-64 (with 8kB page size) */ +#define IMAGE_FILE_MACHINE_LOONGARCH32 0x6232 /* LoongArch 32-bit processor family */ +#define IMAGE_FILE_MACHINE_LOONGARCH64 0x6264 /* LoongArch 64-bit processor family */ +#define IMAGE_FILE_MACHINE_M32R 0x9041 /* Mitsubishi M32R 32-bit Little-Endian */ +#define IMAGE_FILE_MACHINE_M68K 0x0268 /* Motorola 68000 series */ +#define IMAGE_FILE_MACHINE_MIPS16 0x0266 /* MIPS III with MIPS16 ASE Little-Endian */ +#define IMAGE_FILE_MACHINE_MIPSFPU 0x0366 /* MIPS III with FPU Little-Endian */ +#define IMAGE_FILE_MACHINE_MIPSFPU16 0x0466 /* MIPS III with MIPS16 ASE and FPU Little-Endian */ +#define IMAGE_FILE_MACHINE_MPPC_601 0x0601 /* PowerPC 32-bit Big-Endian */ +#define IMAGE_FILE_MACHINE_OMNI 0xace1 /* Microsoft OMNI VM (omniprox.dll) */ +#define IMAGE_FILE_MACHINE_PARISC 0x0290 /* HP PA-RISC */ +#define IMAGE_FILE_MACHINE_POWERPC 0x01f0 /* PowerPC 32-bit Little-Endian */ +#define IMAGE_FILE_MACHINE_POWERPCFP 0x01f1 /* PowerPC 32-bit with FPU Little-Endian */ +#define IMAGE_FILE_MACHINE_POWERPCBE 0x01f2 /* PowerPC 64-bit Big-Endian */ +#define IMAGE_FILE_MACHINE_R3000 0x0162 /* MIPS I Little-Endian */ +#define IMAGE_FILE_MACHINE_R3000_BE 0x0160 /* MIPS I Big-Endian */ +#define IMAGE_FILE_MACHINE_R4000 0x0166 /* MIPS III Little-Endian (with 1kB or 4kB page size) */ +#define IMAGE_FILE_MACHINE_R10000 0x0168 /* MIPS IV Little-Endian */ +#define IMAGE_FILE_MACHINE_RISCV32 0x5032 /* RISC-V 32-bit address space */ +#define IMAGE_FILE_MACHINE_RISCV64 0x5064 /* RISC-V 64-bit address space */ +#define IMAGE_FILE_MACHINE_RISCV128 0x5128 /* RISC-V 128-bit address space */ +#define IMAGE_FILE_MACHINE_SH3 0x01a2 /* Hitachi SH-3 32-bit Little-Endian (with 1kB page size) */ +#define IMAGE_FILE_MACHINE_SH3DSP 0x01a3 /* Hitachi SH-3 DSP 32-bit (with 1kB page size) */ +#define IMAGE_FILE_MACHINE_SH3E 0x01a4 /* Hitachi SH-3E Little-Endian (with 1kB page size) */ +#define IMAGE_FILE_MACHINE_SH4 0x01a6 /* Hitachi SH-4 32-bit Little-Endian (with 1kB page size) */ +#define IMAGE_FILE_MACHINE_SH5 0x01a8 /* Hitachi SH-5 64-bit */ +#define IMAGE_FILE_MACHINE_TAHOE 0x07cc /* Intel EM machine */ +#define IMAGE_FILE_MACHINE_TRICORE 0x0520 /* Infineon AUDO 32-bit */ +#define IMAGE_FILE_MACHINE_WCEMIPSV2 0x0169 /* MIPS Windows CE v2 Little-Endian */ /* flags */ -#define IMAGE_FILE_RELOCS_STRIPPED 0x0001 -#define IMAGE_FILE_EXECUTABLE_IMAGE 0x0002 -#define IMAGE_FILE_LINE_NUMS_STRIPPED 0x0004 -#define IMAGE_FILE_LOCAL_SYMS_STRIPPED 0x0008 -#define IMAGE_FILE_AGGRESSIVE_WS_TRIM 0x0010 -#define IMAGE_FILE_LARGE_ADDRESS_AWARE 0x0020 -#define IMAGE_FILE_16BIT_MACHINE 0x0040 -#define IMAGE_FILE_BYTES_REVERSED_LO 0x0080 -#define IMAGE_FILE_32BIT_MACHINE 0x0100 -#define IMAGE_FILE_DEBUG_STRIPPED 0x0200 -#define IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP 0x0400 -#define IMAGE_FILE_NET_RUN_FROM_SWAP 0x0800 -#define IMAGE_FILE_SYSTEM 0x1000 -#define IMAGE_FILE_DLL 0x2000 -#define IMAGE_FILE_UP_SYSTEM_ONLY 0x4000 -#define IMAGE_FILE_BYTES_REVERSED_HI 0x8000 - -#define IMAGE_FILE_OPT_ROM_MAGIC 0x107 -#define IMAGE_FILE_OPT_PE32_MAGIC 0x10b -#define IMAGE_FILE_OPT_PE32_PLUS_MAGIC 0x20b - -#define IMAGE_SUBSYSTEM_UNKNOWN 0 -#define IMAGE_SUBSYSTEM_NATIVE 1 -#define IMAGE_SUBSYSTEM_WINDOWS_GUI 2 -#define IMAGE_SUBSYSTEM_WINDOWS_CUI 3 -#define IMAGE_SUBSYSTEM_POSIX_CUI 7 -#define IMAGE_SUBSYSTEM_WINDOWS_CE_GUI 9 -#define IMAGE_SUBSYSTEM_EFI_APPLICATION 10 -#define IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER 11 -#define IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER 12 -#define IMAGE_SUBSYSTEM_EFI_ROM_IMAGE 13 -#define IMAGE_SUBSYSTEM_XBOX 14 - -#define IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE 0x0040 -#define IMAGE_DLL_CHARACTERISTICS_FORCE_INTEGRITY 0x0080 -#define IMAGE_DLL_CHARACTERISTICS_NX_COMPAT 0x0100 -#define IMAGE_DLLCHARACTERISTICS_NO_ISOLATION 0x0200 -#define IMAGE_DLLCHARACTERISTICS_NO_SEH 0x0400 -#define IMAGE_DLLCHARACTERISTICS_NO_BIND 0x0800 -#define IMAGE_DLLCHARACTERISTICS_WDM_DRIVER 0x2000 -#define IMAGE_DLLCHARACTERISTICS_TERMINAL_SERVER_AWARE 0x8000 - -#define IMAGE_DLLCHARACTERISTICS_EX_CET_COMPAT 0x0001 -#define IMAGE_DLLCHARACTERISTICS_EX_FORWARD_CFI_COMPAT 0x0040 - -/* they actually defined 0x00000000 as well, but I think we'll skip that one. */ -#define IMAGE_SCN_RESERVED_0 0x00000001 -#define IMAGE_SCN_RESERVED_1 0x00000002 -#define IMAGE_SCN_RESERVED_2 0x00000004 -#define IMAGE_SCN_TYPE_NO_PAD 0x00000008 /* don't pad - obsolete */ -#define IMAGE_SCN_RESERVED_3 0x00000010 +#define IMAGE_FILE_RELOCS_STRIPPED 0x0001 /* Relocation info stripped from file */ +#define IMAGE_FILE_EXECUTABLE_IMAGE 0x0002 /* File is executable (i.e. no unresolved external references) */ +#define IMAGE_FILE_LINE_NUMS_STRIPPED 0x0004 /* Line nunbers stripped from file */ +#define IMAGE_FILE_LOCAL_SYMS_STRIPPED 0x0008 /* Local symbols stripped from file */ +#define IMAGE_FILE_AGGRESSIVE_WS_TRIM 0x0010 /* Aggressively trim working set */ +#define IMAGE_FILE_LARGE_ADDRESS_AWARE 0x0020 /* App can handle >2gb addresses (image can be loaded at address above 2GB) */ +#define IMAGE_FILE_16BIT_MACHINE 0x0040 /* 16 bit word machine */ +#define IMAGE_FILE_BYTES_REVERSED_LO 0x0080 /* Bytes of machine word are reversed (should be set together with IMAGE_FILE_BYTES_REVERSED_HI) */ +#define IMAGE_FILE_32BIT_MACHINE 0x0100 /* 32 bit word machine */ +#define IMAGE_FILE_DEBUG_STRIPPED 0x0200 /* Debugging info stripped from file in .DBG file */ +#define IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP 0x0400 /* If Image is on removable media, copy and run from the swap file */ +#define IMAGE_FILE_NET_RUN_FROM_SWAP 0x0800 /* If Image is on Net, copy and run from the swap file */ +#define IMAGE_FILE_SYSTEM 0x1000 /* System kernel-mode file (can't be loaded in user-mode) */ +#define IMAGE_FILE_DLL 0x2000 /* File is a DLL */ +#define IMAGE_FILE_UP_SYSTEM_ONLY 0x4000 /* File should only be run on a UP (uniprocessor) machine */ +#define IMAGE_FILE_BYTES_REVERSED_HI 0x8000 /* Bytes of machine word are reversed (should be set together with IMAGE_FILE_BYTES_REVERSED_LO) */ + +/* subsys */ +#define IMAGE_SUBSYSTEM_UNKNOWN 0 /* Unknown subsystem */ +#define IMAGE_SUBSYSTEM_NATIVE 1 /* No subsystem required (NT device drivers and NT native system processes) */ +#define IMAGE_SUBSYSTEM_WINDOWS_GUI 2 /* Windows graphical user interface (GUI) subsystem */ +#define IMAGE_SUBSYSTEM_WINDOWS_CUI 3 /* Windows character-mode user interface (CUI) subsystem */ +#define IMAGE_SUBSYSTEM_WINDOWS_OLD_CE_GUI 4 /* Old Windows CE subsystem */ +#define IMAGE_SUBSYSTEM_OS2_CUI 5 /* OS/2 CUI subsystem */ +#define IMAGE_SUBSYSTEM_RESERVED_6 6 +#define IMAGE_SUBSYSTEM_POSIX_CUI 7 /* POSIX CUI subsystem */ +#define IMAGE_SUBSYSTEM_MMOSA 8 /* MMOSA/Native Win32E */ +#define IMAGE_SUBSYSTEM_WINDOWS_CE_GUI 9 /* Windows CE subsystem */ +#define IMAGE_SUBSYSTEM_EFI_APPLICATION 10 /* Extensible Firmware Interface (EFI) application */ +#define IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER 11 /* EFI driver with boot services */ +#define IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER 12 /* EFI driver with run-time services */ +#define IMAGE_SUBSYSTEM_EFI_ROM_IMAGE 13 /* EFI ROM image */ +#define IMAGE_SUBSYSTEM_XBOX 14 /* Xbox system */ +#define IMAGE_SUBSYSTEM_RESERVED_15 15 +#define IMAGE_SUBSYSTEM_WINDOWS_BOOT_APPLICATION 16 /* Windows Boot application */ +#define IMAGE_SUBSYSTEM_XBOX_CODE_CATALOG 17 /* Xbox Code Catalog */ + +/* dll_flags */ +#define IMAGE_LIBRARY_PROCESS_INIT 0x0001 /* DLL initialization function called just after process initialization */ +#define IMAGE_LIBRARY_PROCESS_TERM 0x0002 /* DLL initialization function called just before process termination */ +#define IMAGE_LIBRARY_THREAD_INIT 0x0004 /* DLL initialization function called just after thread initialization */ +#define IMAGE_LIBRARY_THREAD_TERM 0x0008 /* DLL initialization function called just before thread initialization */ +#define IMAGE_DLLCHARACTERISTICS_RESERVED_4 0x0010 +#define IMAGE_DLLCHARACTERISTICS_HIGH_ENTROPY_VA 0x0020 /* ASLR with 64 bit address space (image can be loaded at address above 4GB) */ +#define IMAGE_DLLCHARACTERISTICS_DYNAMIC_BASE 0x0040 /* The DLL can be relocated at load time */ +#define IMAGE_DLLCHARACTERISTICS_FORCE_INTEGRITY 0x0080 /* Code integrity checks are forced */ +#define IMAGE_DLLCHARACTERISTICS_NX_COMPAT 0x0100 /* Image is compatible with data execution prevention */ +#define IMAGE_DLLCHARACTERISTICS_NO_ISOLATION 0x0200 /* Image is isolation aware, but should not be isolated (prevents loading of manifest file) */ +#define IMAGE_DLLCHARACTERISTICS_NO_SEH 0x0400 /* Image does not use SEH, no SE handler may reside in this image */ +#define IMAGE_DLLCHARACTERISTICS_NO_BIND 0x0800 /* Do not bind the image */ +#define IMAGE_DLLCHARACTERISTICS_X86_THUNK 0x1000 /* Image is a Wx86 Thunk DLL (for non-x86/risc DLL files) */ +#define IMAGE_DLLCHARACTERISTICS_APPCONTAINER 0x1000 /* Image should execute in an AppContainer (for EXE Metro Apps in Windows 8) */ +#define IMAGE_DLLCHARACTERISTICS_WDM_DRIVER 0x2000 /* A WDM driver */ +#define IMAGE_DLLCHARACTERISTICS_GUARD_CF 0x4000 /* Image supports Control Flow Guard */ +#define IMAGE_DLLCHARACTERISTICS_TERMINAL_SERVER_AWARE 0x8000 /* The image is terminal server (Remote Desktop Services) aware */ + +/* IMAGE_DEBUG_TYPE_EX_DLLCHARACTERISTICS flags */ +#define IMAGE_DLLCHARACTERISTICS_EX_CET_COMPAT 0x0001 /* Image is Control-flow Enforcement Technology Shadow Stack compatible */ +#define IMAGE_DLLCHARACTERISTICS_EX_CET_COMPAT_STRICT_MODE 0x0002 /* CET is enforced in strict mode */ +#define IMAGE_DLLCHARACTERISTICS_EX_CET_SET_CONTEXT_IP_VALIDATION_RELAXED_MODE 0x0004 /* Relaxed mode for Context IP Validation under CET is allowed */ +#define IMAGE_DLLCHARACTERISTICS_EX_CET_DYNAMIC_APIS_ALLOW_IN_PROC 0x0008 /* Use of dynamic APIs is restricted to processes only */ +#define IMAGE_DLLCHARACTERISTICS_EX_CET_RESERVED_1 0x0010 +#define IMAGE_DLLCHARACTERISTICS_EX_CET_RESERVED_2 0x0020 +#define IMAGE_DLLCHARACTERISTICS_EX_FORWARD_CFI_COMPAT 0x0040 /* All branch targets in all image code sections are annotated with forward-edge control flow integrity guard instructions */ +#define IMAGE_DLLCHARACTERISTICS_EX_HOTPATCH_COMPATIBLE 0x0080 /* Image can be modified while in use, hotpatch-compatible */ + +/* section_header flags */ +#define IMAGE_SCN_SCALE_INDEX 0x00000001 /* address of tls index is scaled = multiplied by 4 (for .tls section on MIPS only) */ +#define IMAGE_SCN_TYPE_NO_LOAD 0x00000002 /* reserved */ +#define IMAGE_SCN_TYPE_GROUPED 0x00000004 /* obsolete (used for 16-bit offset code) */ +#define IMAGE_SCN_TYPE_NO_PAD 0x00000008 /* .o only - don't pad - obsolete (same as IMAGE_SCN_ALIGN_1BYTES) */ +#define IMAGE_SCN_TYPE_COPY 0x00000010 /* reserved */ #define IMAGE_SCN_CNT_CODE 0x00000020 /* .text */ #define IMAGE_SCN_CNT_INITIALIZED_DATA 0x00000040 /* .data */ #define IMAGE_SCN_CNT_UNINITIALIZED_DATA 0x00000080 /* .bss */ -#define IMAGE_SCN_LNK_OTHER 0x00000100 /* reserved */ -#define IMAGE_SCN_LNK_INFO 0x00000200 /* .drectve comments */ -#define IMAGE_SCN_RESERVED_4 0x00000400 +#define IMAGE_SCN_LNK_OTHER 0x00000100 /* .o only - other type than code, data or info */ +#define IMAGE_SCN_LNK_INFO 0x00000200 /* .o only - .drectve comments */ +#define IMAGE_SCN_LNK_OVERLAY 0x00000400 /* section contains overlay */ #define IMAGE_SCN_LNK_REMOVE 0x00000800 /* .o only - scn to be rm'd*/ #define IMAGE_SCN_LNK_COMDAT 0x00001000 /* .o only - COMDAT data */ -#define IMAGE_SCN_RESERVED_5 0x00002000 /* spec omits this */ -#define IMAGE_SCN_RESERVED_6 0x00004000 /* spec omits this */ -#define IMAGE_SCN_GPREL 0x00008000 /* global pointer referenced data */ -/* spec lists 0x20000 twice, I suspect they meant 0x10000 for one of them */ -#define IMAGE_SCN_MEM_PURGEABLE 0x00010000 /* reserved for "future" use */ -#define IMAGE_SCN_16BIT 0x00020000 /* reserved for "future" use */ -#define IMAGE_SCN_LOCKED 0x00040000 /* reserved for "future" use */ -#define IMAGE_SCN_PRELOAD 0x00080000 /* reserved for "future" use */ +#define IMAGE_SCN_RESERVED_13 0x00002000 /* spec omits this */ +#define IMAGE_SCN_MEM_PROTECTED 0x00004000 /* section is memory protected (for M68K) */ +#define IMAGE_SCN_NO_DEFER_SPEC_EXC 0x00004000 /* reset speculative exceptions handling bits in the TLB entries (for non-M68K) */ +#define IMAGE_SCN_MEM_FARDATA 0x00008000 /* section uses FAR_EXTERNAL relocations (for M68K) */ +#define IMAGE_SCN_GPREL 0x00008000 /* global pointer referenced data (for non-M68K) */ +#define IMAGE_SCN_MEM_SYSHEAP 0x00010000 /* use system heap (for M68K) */ +#define IMAGE_SCN_MEM_PURGEABLE 0x00020000 /* section can be released from RAM (for M68K) */ +#define IMAGE_SCN_MEM_16BIT 0x00020000 /* section is 16-bit (for non-M68K where it makes sense: I386, THUMB, MIPS16, MIPSFPU16, ...) */ +#define IMAGE_SCN_MEM_LOCKED 0x00040000 /* prevent the section from being moved (for M68K and .o I386) */ +#define IMAGE_SCN_MEM_PRELOAD 0x00080000 /* section is preload to RAM (for M68K and .o I386) */ /* and here they just stuck a 1-byte integer in the middle of a bitfield */ -#define IMAGE_SCN_ALIGN_1BYTES 0x00100000 /* it does what it says on the box */ +#define IMAGE_SCN_ALIGN_1BYTES 0x00100000 /* .o only - it does what it says on the box */ #define IMAGE_SCN_ALIGN_2BYTES 0x00200000 #define IMAGE_SCN_ALIGN_4BYTES 0x00300000 #define IMAGE_SCN_ALIGN_8BYTES 0x00400000 @@ -159,7 +206,9 @@ #define IMAGE_SCN_ALIGN_2048BYTES 0x00c00000 #define IMAGE_SCN_ALIGN_4096BYTES 0x00d00000 #define IMAGE_SCN_ALIGN_8192BYTES 0x00e00000 -#define IMAGE_SCN_LNK_NRELOC_OVFL 0x01000000 /* extended relocations */ +#define IMAGE_SCN_ALIGN_RESERVED 0x00f00000 +#define IMAGE_SCN_ALIGN_MASK 0x00f00000 +#define IMAGE_SCN_LNK_NRELOC_OVFL 0x01000000 /* .o only - extended relocations */ #define IMAGE_SCN_MEM_DISCARDABLE 0x02000000 /* scn can be discarded */ #define IMAGE_SCN_MEM_NOT_CACHED 0x04000000 /* cannot be cached */ #define IMAGE_SCN_MEM_NOT_PAGED 0x08000000 /* not pageable */ @@ -168,8 +217,28 @@ #define IMAGE_SCN_MEM_READ 0x40000000 /* readable */ #define IMAGE_SCN_MEM_WRITE 0x80000000 /* writeable */ -#define IMAGE_DEBUG_TYPE_CODEVIEW 2 -#define IMAGE_DEBUG_TYPE_EX_DLLCHARACTERISTICS 20 +#define IMAGE_DEBUG_TYPE_UNKNOWN 0 /* Unknown value, ignored by all tools */ +#define IMAGE_DEBUG_TYPE_COFF 1 /* COFF debugging information */ +#define IMAGE_DEBUG_TYPE_CODEVIEW 2 /* CodeView debugging information or Visual C++ Program Database debugging information */ +#define IMAGE_DEBUG_TYPE_FPO 3 /* Frame pointer omission (FPO) information */ +#define IMAGE_DEBUG_TYPE_MISC 4 /* Location of DBG file with CodeView debugging information */ +#define IMAGE_DEBUG_TYPE_EXCEPTION 5 /* Exception information, copy of .pdata section */ +#define IMAGE_DEBUG_TYPE_FIXUP 6 /* Fixup information */ +#define IMAGE_DEBUG_TYPE_OMAP_TO_SRC 7 /* The mapping from an RVA in image to an RVA in source image */ +#define IMAGE_DEBUG_TYPE_OMAP_FROM_SRC 8 /* The mapping from an RVA in source image to an RVA in image */ +#define IMAGE_DEBUG_TYPE_BORLAND 9 /* Borland debugging information */ +#define IMAGE_DEBUG_TYPE_RESERVED10 10 /* Coldpath / Hotpatch debug information */ +#define IMAGE_DEBUG_TYPE_CLSID 11 /* CLSID */ +#define IMAGE_DEBUG_TYPE_VC_FEATURE 12 /* Visual C++ counts / statistics */ +#define IMAGE_DEBUG_TYPE_POGO 13 /* COFF group information, data for profile-guided optimization */ +#define IMAGE_DEBUG_TYPE_ILTCG 14 /* Incremental link-time code generation */ +#define IMAGE_DEBUG_TYPE_MPX 15 /* Intel Memory Protection Extensions */ +#define IMAGE_DEBUG_TYPE_REPRO 16 /* PE determinism or reproducibility */ +#define IMAGE_DEBUG_TYPE_EMBEDDED_PORTABLE_PDB 17 /* Embedded Portable PDB debugging information */ +#define IMAGE_DEBUG_TYPE_SPGO 18 /* Sample profile-guided optimization */ +#define IMAGE_DEBUG_TYPE_PDBCHECKSUM 19 /* PDB Checksum */ +#define IMAGE_DEBUG_TYPE_EX_DLLCHARACTERISTICS 20 /* Extended DLL characteristics bits */ +#define IMAGE_DEBUG_TYPE_PERFMAP 21 /* Location of associated Ready To Run PerfMap file */ #ifndef __ASSEMBLY__ @@ -235,7 +304,7 @@ struct pe32_opt_hdr { uint16_t image_minor; /* minor image version */ uint16_t subsys_major; /* major subsystem version */ uint16_t subsys_minor; /* minor subsystem version */ - uint32_t win32_version; /* reserved, must be 0 */ + uint32_t win32_version; /* win32 version reported at runtime */ uint32_t image_size; /* image size */ uint32_t header_size; /* header size rounded up to file_align */ @@ -246,7 +315,7 @@ struct pe32_opt_hdr { uint32_t stack_size; /* amt of stack required */ uint32_t heap_size_req; /* amt of heap requested */ uint32_t heap_size; /* amt of heap required */ - uint32_t loader_flags; /* reserved, must be 0 */ + uint32_t loader_flags; /* loader flags */ uint32_t data_dirs; /* number of data dir entries */ }; @@ -269,7 +338,7 @@ struct pe32plus_opt_hdr { uint16_t image_minor; /* minor image version */ uint16_t subsys_major; /* major subsystem version */ uint16_t subsys_minor; /* minor subsystem version */ - uint32_t win32_version; /* reserved, must be 0 */ + uint32_t win32_version; /* win32 version reported at runtime */ uint32_t image_size; /* image size */ uint32_t header_size; /* header size rounded up to file_align */ @@ -280,7 +349,7 @@ struct pe32plus_opt_hdr { uint64_t stack_size; /* amt of stack required */ uint64_t heap_size_req; /* amt of heap requested */ uint64_t heap_size; /* amt of heap required */ - uint32_t loader_flags; /* reserved, must be 0 */ + uint32_t loader_flags; /* loader flags */ uint32_t data_dirs; /* number of data dir entries */ }; @@ -301,10 +370,10 @@ struct data_directory { struct data_dirent global_ptr; /* global pointer reg. Size=0 */ struct data_dirent tls; /* .tls */ struct data_dirent load_config; /* load configuration structure */ - struct data_dirent bound_imports; /* no idea */ + struct data_dirent bound_imports; /* bound import table */ struct data_dirent import_addrs; /* import address table */ struct data_dirent delay_imports; /* delay-load import table */ - struct data_dirent clr_runtime_hdr; /* .cor (object only) */ + struct data_dirent clr_runtime_hdr; /* .cor (clr/.net executables) */ struct data_dirent reserved; }; -- cgit v1.2.3 From e341f9c3c8412e57fe0042a33a2640245ecdf619 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Mon, 5 May 2025 11:23:28 -0700 Subject: mm/mempolicy: Weighted Interleave Auto-tuning On machines with multiple memory nodes, interleaving page allocations across nodes allows for better utilization of each node's bandwidth. Previous work by Gregory Price [1] introduced weighted interleave, which allowed for pages to be allocated across nodes according to user-set ratios. Ideally, these weights should be proportional to their bandwidth, so that under bandwidth pressure, each node uses its maximal efficient bandwidth and prevents latency from increasing exponentially. Previously, weighted interleave's default weights were just 1s -- which would be equivalent to the (unweighted) interleave mempolicy, which goes through the nodes in a round-robin fashion, ignoring bandwidth information. This patch has two main goals: First, it makes weighted interleave easier to use for users who wish to relieve bandwidth pressure when using nodes with varying bandwidth (CXL). By providing a set of "real" default weights that just work out of the box, users who might not have the capability (or wish to) perform experimentation to find the most optimal weights for their system can still take advantage of bandwidth-informed weighted interleave. Second, it allows for weighted interleave to dynamically adjust to hotplugged memory with new bandwidth information. Instead of manually updating node weights every time new bandwidth information is reported or taken off, weighted interleave adjusts and provides a new set of default weights for weighted interleave to use when there is a change in bandwidth information. To meet these goals, this patch introduces an auto-configuration mode for the interleave weights that provides a reasonable set of default weights, calculated using bandwidth data reported by the system. In auto mode, weights are dynamically adjusted based on whatever the current bandwidth information reports (and responds to hotplug events). This patch still supports users manually writing weights into the nodeN sysfs interface by entering into manual mode. When a user enters manual mode, the system stops dynamically updating any of the node weights, even during hotplug events that shift the optimal weight distribution. A new sysfs interface "auto" is introduced, which allows users to switch between the auto (writing 1 or Y) and manual (writing 0 or N) modes. The system also automatically enters manual mode when a nodeN interface is manually written to. There is one functional change that this patch makes to the existing weighted_interleave ABI: previously, writing 0 directly to a nodeN interface was said to reset the weight to the system default. Before this patch, the default for all weights were 1, which meant that writing 0 and 1 were functionally equivalent. With this patch, writing 0 is invalid. Link: https://lkml.kernel.org/r/20250520141236.2987309-1-joshua.hahnjy@gmail.com [joshua.hahnjy@gmail.com: wordsmithing changes, simplification, fixes] Link: https://lkml.kernel.org/r/20250511025840.2410154-1-joshua.hahnjy@gmail.com [joshua.hahnjy@gmail.com: remove auto_kobj_attr field from struct sysfs_wi_group] Link: https://lkml.kernel.org/r/20250512142511.3959833-1-joshua.hahnjy@gmail.com https://lore.kernel.org/linux-mm/20240202170238.90004-1-gregory.price@memverge.com/ [1] Link: https://lkml.kernel.org/r/20250505182328.4148265-1-joshua.hahnjy@gmail.com Co-developed-by: Gregory Price Signed-off-by: Gregory Price Signed-off-by: Joshua Hahn Suggested-by: Yunjeong Mun Suggested-by: Oscar Salvador Suggested-by: Ying Huang Suggested-by: Harry Yoo Reviewed-by: Harry Yoo Reviewed-by: Huang Ying Reviewed-by: Honggyu Kim Cc: Dan Williams Cc: Dave Jiang Cc: Greg Kroah-Hartman Cc: Joanthan Cameron Cc: Johannes Weiner Cc: Len Brown Signed-off-by: Andrew Morton --- include/linux/mempolicy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index ce9885e0178a..0fe96f3ab3ef 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -178,6 +179,9 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol) extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); +extern int mempolicy_set_node_perf(unsigned int node, + struct access_coordinate *coords); + #else struct mempolicy {}; -- cgit v1.2.3 From bf454ec31add6790f6cdc88328e38901fcbbade6 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Fri, 2 May 2025 09:12:35 +0800 Subject: kexec_file: allow to place kexec_buf randomly Patch series "Support kdump with LUKS encryption by reusing LUKS volume keys", v9. LUKS is the standard for Linux disk encryption, widely adopted by users, and in some cases, such as Confidential VMs, it is a requirement. With kdump enabled, when the first kernel crashes, the system can boot into the kdump/crash kernel to dump the memory image (i.e., /proc/vmcore) to a specified target. However, there are two challenges when dumping vmcore to a LUKS-encrypted device: - Kdump kernel may not be able to decrypt the LUKS partition. For some machines, a system administrator may not have a chance to enter the password to decrypt the device in kdump initramfs after the 1st kernel crashes; For cloud confidential VMs, depending on the policy the kdump kernel may not be able to unseal the keys with TPM and the console virtual keyboard is untrusted. - LUKS2 by default use the memory-hard Argon2 key derivation function which is quite memory-consuming compared to the limited memory reserved for kdump. Take Fedora example, by default, only 256M is reserved for systems having memory between 4G-64G. With LUKS enabled, ~1300M needs to be reserved for kdump. Note if the memory reserved for kdump can't be used by 1st kernel i.e. an user sees ~1300M memory missing in the 1st kernel. Besides users (at least for Fedora) usually expect kdump to work out of the box i.e. no manual password input or custom crashkernel value is needed. And it doesn't make sense to derivate the keys again in kdump kernel which seems to be redundant work. This patchset addresses the above issues by making the LUKS volume keys persistent for kdump kernel with the help of cryptsetup's new APIs (--link-vk-to-keyring/--volume-key-keyring). Here is the life cycle of the kdump copies of LUKS volume keys, 1. After the 1st kernel loads the initramfs during boot, systemd use an user-input passphrase to de-crypt the LUKS volume keys or TPM-sealed key and then save the volume keys to specified keyring (using the --link-vk-to-keyring API) and the key will expire within specified time. 2. A user space tool (kdump initramfs loader like kdump-utils) create key items inside /sys/kernel/config/crash_dm_crypt_keys to inform the 1st kernel which keys are needed. 3. When the kdump initramfs is loaded by the kexec_file_load syscall, the 1st kernel will iterate created key items, save the keys to kdump reserved memory. 4. When the 1st kernel crashes and the kdump initramfs is booted, the kdump initramfs asks the kdump kernel to create a user key using the key stored in kdump reserved memory by writing yes to /sys/kernel/crash_dm_crypt_keys/restore. Then the LUKS encrypted device is unlocked with libcryptsetup's --volume-key-keyring API. 5. The system gets rebooted to the 1st kernel after dumping vmcore to the LUKS encrypted device is finished After libcryptsetup saving the LUKS volume keys to specified keyring, whoever takes this should be responsible for the safety of these copies of keys. The keys will be saved in the memory area exclusively reserved for kdump where even the 1st kernel has no direct access. And further more, two additional protections are added, - save the copy randomly in kdump reserved memory as suggested by Jan - clear the _PAGE_PRESENT flag of the page that stores the copy as suggested by Pingfan This patchset only supports x86. There will be patches to support other architectures once this patch set gets merged. This patch (of 9): Currently, kexec_buf is placed in order which means for the same machine, the info in the kexec_buf is always located at the same position each time the machine is booted. This may cause a risk for sensitive information like LUKS volume key. Now struct kexec_buf has a new field random which indicates it's supposed to be placed in a random position. Note this feature is enabled only when CONFIG_CRASH_DUMP is enabled. So it only takes effect for kdump and won't impact kexec reboot. Link: https://lkml.kernel.org/r/20250502011246.99238-1-coxu@redhat.com Link: https://lkml.kernel.org/r/20250502011246.99238-2-coxu@redhat.com Signed-off-by: Coiby Xu Suggested-by: Jan Pazdziora Acked-by: Baoquan He Cc: "Daniel P. Berrange" Cc: Dave Hansen Cc: Dave Young Cc: Liu Pingfan Cc: Milan Broz Cc: Ondrej Kozina Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton --- include/linux/kexec.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index c8971861521a..1871eaa95432 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -25,6 +25,10 @@ extern note_buf_t __percpu *crash_notes; +#ifdef CONFIG_CRASH_DUMP +#include +#endif + #ifdef CONFIG_KEXEC_CORE #include #include @@ -169,6 +173,7 @@ int kexec_image_post_load_cleanup_default(struct kimage *image); * @buf_min: The buffer can't be placed below this address. * @buf_max: The buffer can't be placed above this address. * @top_down: Allocate from top of memory. + * @random: Place the buffer at a random position. */ struct kexec_buf { struct kimage *image; @@ -180,8 +185,33 @@ struct kexec_buf { unsigned long buf_min; unsigned long buf_max; bool top_down; +#ifdef CONFIG_CRASH_DUMP + bool random; +#endif }; + +#ifdef CONFIG_CRASH_DUMP +static inline void kexec_random_range_start(unsigned long start, + unsigned long end, + struct kexec_buf *kbuf, + unsigned long *temp_start) +{ + unsigned short i; + + if (kbuf->random) { + get_random_bytes(&i, sizeof(unsigned short)); + *temp_start = start + (end - start) / USHRT_MAX * i; + } +} +#else +static inline void kexec_random_range_start(unsigned long start, + unsigned long end, + struct kexec_buf *kbuf, + unsigned long *temp_start) +{} +#endif + int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf); int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, void *buf, unsigned int size, -- cgit v1.2.3 From 479e58549b0fa7e80f1e0b9e69e0a2a8e6711132 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Fri, 2 May 2025 09:12:37 +0800 Subject: crash_dump: store dm crypt keys in kdump reserved memory When the kdump kernel image and initrd are loaded, the dm crypts keys will be read from keyring and then stored in kdump reserved memory. Assume a key won't exceed 256 bytes thus MAX_KEY_SIZE=256 according to "cryptsetup benchmark". Link: https://lkml.kernel.org/r/20250502011246.99238-4-coxu@redhat.com Signed-off-by: Coiby Xu Acked-by: Baoquan He Cc: "Daniel P. Berrange" Cc: Dave Hansen Cc: Dave Young Cc: Jan Pazdziora Cc: Liu Pingfan Cc: Milan Broz Cc: Ondrej Kozina Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton --- include/linux/crash_core.h | 6 +++++- include/linux/kexec.h | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 44305336314e..2e6782239034 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -34,7 +34,11 @@ static inline void arch_kexec_protect_crashkres(void) { } static inline void arch_kexec_unprotect_crashkres(void) { } #endif - +#ifdef CONFIG_CRASH_DM_CRYPT +int crash_load_dm_crypt_keys(struct kimage *image); +#else +static inline int crash_load_dm_crypt_keys(struct kimage *image) {return 0; } +#endif #ifndef arch_crash_handle_hotplug_event static inline void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { } diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 1871eaa95432..6e688c5d8e4d 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -405,6 +405,10 @@ struct kimage { void *elf_headers; unsigned long elf_headers_sz; unsigned long elf_load_addr; + + /* dm crypt keys buffer */ + unsigned long dm_crypt_keys_addr; + unsigned long dm_crypt_keys_sz; }; /* kexec interface functions */ -- cgit v1.2.3 From 62f17d9df6924cf805de5ae970470615c1c8d9f2 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Fri, 2 May 2025 09:12:39 +0800 Subject: crash_dump: retrieve dm crypt keys in kdump kernel Crash kernel will retrieve the dm crypt keys based on the dmcryptkeys command line parameter. When user space writes the key description to /sys/kernel/config/crash_dm_crypt_key/restore, the crash kernel will save the encryption keys to the user keyring. Then user space e.g. cryptsetup's --volume-key-keyring API can use it to unlock the encrypted device. Link: https://lkml.kernel.org/r/20250502011246.99238-6-coxu@redhat.com Signed-off-by: Coiby Xu Acked-by: Baoquan He Cc: "Daniel P. Berrange" Cc: Dave Hansen Cc: Dave Young Cc: Jan Pazdziora Cc: Liu Pingfan Cc: Milan Broz Cc: Ondrej Kozina Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton --- include/linux/crash_core.h | 1 + include/linux/crash_dump.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 2e6782239034..d35726d6a415 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -36,6 +36,7 @@ static inline void arch_kexec_unprotect_crashkres(void) { } #ifdef CONFIG_CRASH_DM_CRYPT int crash_load_dm_crypt_keys(struct kimage *image); +ssize_t dm_crypt_keys_read(char *buf, size_t count, u64 *ppos); #else static inline int crash_load_dm_crypt_keys(struct kimage *image) {return 0; } #endif diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 2f2555e6407c..dd6fc3b2133b 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -15,6 +15,8 @@ extern unsigned long long elfcorehdr_addr; extern unsigned long long elfcorehdr_size; +extern unsigned long long dm_crypt_keys_addr; + #ifdef CONFIG_CRASH_DUMP extern int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size); extern void elfcorehdr_free(unsigned long long addr); -- cgit v1.2.3 From 4e7bca76e3fed58437dcd7f747d1c8d98507379e Mon Sep 17 00:00:00 2001 From: Hans Zhang Date: Sat, 17 May 2025 00:52:22 +0800 Subject: PCI/MSI: Use bool for MSI enable state tracking Convert pci_msi_enable and pci_msi_enabled() to use bool type for clarity. No functional changes, only code cleanup. Signed-off-by: Hans Zhang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250516165223.125083-2-18255117159@163.com --- include/linux/pci.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 0e8e3fd77e96..f5e908a56498 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1669,7 +1669,7 @@ void pci_disable_msi(struct pci_dev *dev); int pci_msix_vec_count(struct pci_dev *dev); void pci_disable_msix(struct pci_dev *dev); void pci_restore_msi_state(struct pci_dev *dev); -int pci_msi_enabled(void); +bool pci_msi_enabled(void); int pci_enable_msi(struct pci_dev *dev); int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, int minvec, int maxvec); @@ -1702,7 +1702,7 @@ static inline void pci_disable_msi(struct pci_dev *dev) { } static inline int pci_msix_vec_count(struct pci_dev *dev) { return -ENOSYS; } static inline void pci_disable_msix(struct pci_dev *dev) { } static inline void pci_restore_msi_state(struct pci_dev *dev) { } -static inline int pci_msi_enabled(void) { return 0; } +static inline bool pci_msi_enabled(void) { return false; } static inline int pci_enable_msi(struct pci_dev *dev) { return -ENOSYS; } static inline int pci_enable_msix_range(struct pci_dev *dev, -- cgit v1.2.3 From a5bd029c733b8ae790d5873e2afeb88b58e3a151 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:04 -0700 Subject: net: add skb_crc32c() Add skb_crc32c(), which calculates the CRC32C of a sk_buff. It will replace __skb_checksum(), which unnecessarily supports arbitrary checksums. Compared to __skb_checksum(), skb_crc32c(): - Uses the correct type for CRC32C values (u32, not __wsum). - Does not require the caller to provide a skb_checksum_ops struct. - Is faster because it does not use indirect calls and does not use the very slow crc32c_combine(). According to commit 2817a336d4d5 ("net: skb_checksum: allow custom update/combine for walking skb") which added __skb_checksum(), the original motivation for the abstraction layer was to avoid code duplication for CRC32C and other checksums in the future. However: - No additional checksums showed up after CRC32C. __skb_checksum() is only used with the "regular" net checksum and CRC32C. - Indirect calls are expensive. Commit 2544af0344ba ("net: avoid indirect calls in L4 checksum calculation") worked around this using the INDIRECT_CALL_1 macro. But that only avoided the indirect call for the net checksum, and at the cost of an extra branch. - The checksums use different types (__wsum and u32), causing casts to be needed. - It made the checksums of fragments be combined (rather than chained) for both checksums, despite this being highly counterproductive for CRC32C due to how slow crc32c_combine() is. This can clearly be seen in commit 4c2f24549644 ("sctp: linearize early if it's not GSO") which tried to work around this performance bug. With a dedicated function for each checksum, we can instead just use the proper strategy for each checksum. As shown by the following tables, the new function skb_crc32c() is faster than __skb_checksum(), with the improvement varying greatly from 5% to 2500% depending on the case. The largest improvements come from fragmented packets, mainly due to eliminating the inefficient crc32c_combine(). But linear packets are improved too, especially shorter ones, mainly due to eliminating indirect calls. These benchmarks were done on AMD Zen 5. On that CPU, Linux uses IBRS instead of retpoline; an even greater improvement might be seen with retpoline: Linear sk_buffs Length in bytes __skb_checksum cycles skb_crc32c cycles =============== ===================== ================= 64 43 18 256 94 77 1420 204 161 16384 1735 1642 Nonlinear sk_buffs (even split between head and one fragment) Length in bytes __skb_checksum cycles skb_crc32c cycles =============== ===================== ================= 64 579 22 256 829 77 1420 1506 194 16384 4365 1682 Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-3-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c7397b17bb08..7ccc6356acac 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4203,6 +4203,7 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum, const struct skb_checksum_ops *ops); __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); +u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc); static inline void * __must_check __skb_header_pointer(const struct sk_buff *skb, int offset, int len, -- cgit v1.2.3 From 70c96c7cb9f035d5b960021f2450afa6240e66b4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:08 -0700 Subject: net: fold __skb_checksum() into skb_checksum() Now that the only remaining caller of __skb_checksum() is skb_checksum(), fold __skb_checksum() into skb_checksum(). This makes struct skb_checksum_ops unnecessary, so remove that too and simply do the "regular" net checksum. It also makes the wrapper functions csum_partial_ext() and csum_block_add_ext() unnecessary, so remove those too and just use the underlying functions. Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-7-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7ccc6356acac..018c07230513 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4192,15 +4192,6 @@ static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) return copy_to_iter(data, len, &msg->msg_iter) == len ? 0 : -EFAULT; } -struct skb_checksum_ops { - __wsum (*update)(const void *mem, int len, __wsum wsum); - __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len); -}; - -extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly; - -__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, - __wsum csum, const struct skb_checksum_ops *ops); __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc); -- cgit v1.2.3 From b82f72292ab4c65250bd734281464a6ab1ff4133 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:09 -0700 Subject: lib/crc32: remove unused support for CRC32C combination crc32c_combine() and crc32c_shift() are no longer used (except by the KUnit test that tests them), and their current implementation is very slow. Remove them. Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-8-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/crc32.h | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 69c2e8bb3782..7f7d0be8a0ac 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -76,29 +76,6 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2) return crc32_le_shift(crc1, len2) ^ crc2; } -u32 crc32c_shift(u32 crc, size_t len); - -/** - * crc32c_combine - Combine two crc32c check values into one. For two sequences - * of bytes, seq1 and seq2 with lengths len1 and len2, crc32c() - * check values were calculated for each, crc1 and crc2. - * - * @crc1: crc32c of the first block - * @crc2: crc32c of the second block - * @len2: length of the second block - * - * Return: The crc32c() check value of seq1 and seq2 concatenated, requiring - * only crc1, crc2, and len2. Note: If seq_full denotes the concatenated - * memory area of seq1 with seq2, and crc_full the crc32c() value of - * seq_full, then crc_full == crc32c_combine(crc1, crc2, len2) when - * crc_full was seeded with the same initializer as crc1, and crc2 seed - * was 0. See also crc_combine_test(). - */ -static inline u32 crc32c_combine(u32 crc1, u32 crc2, size_t len2) -{ - return crc32c_shift(crc1, len2) ^ crc2; -} - #define crc32(seed, data, length) crc32_le(seed, (unsigned char const *)(data), length) /* -- cgit v1.2.3 From ea6342d98928e243f2024fb97a9b4d42ee55dfba Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:10 -0700 Subject: net: add skb_copy_and_crc32c_datagram_iter() Since skb_copy_and_hash_datagram_iter() is used only with CRC32C, the crypto_ahash abstraction provides no value. Add skb_copy_and_crc32c_datagram_iter() which just calls crc32c() directly. This is faster and simpler. It also doesn't have the weird dependency issue where skb_copy_and_hash_datagram_iter() depends on CONFIG_CRYPTO_HASH=y without that being expressed explicitly in the kconfig (presumably because it was too heavyweight for NET to select). The new function is conditional on the hidden boolean symbol NET_CRC32C, which selects CRC32. So it gets compiled only when something that actually needs CRC32C packet checksums is enabled, it has no implicit dependency, and it doesn't depend on the heavyweight crypto layer. Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-9-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 018c07230513..510adf63c211 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4137,6 +4137,8 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, struct ahash_request *hash); +int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, u32 *crcp); int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len); int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); -- cgit v1.2.3 From c93f75b2d755c35b596084ddd3feb3528284a53f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:12 -0700 Subject: net: remove skb_copy_and_hash_datagram_iter() Now that skb_copy_and_hash_datagram_iter() is no longer used, remove it. Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-11-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 510adf63c211..5520524c93bf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -274,7 +274,6 @@ SKB_DATA_ALIGN(sizeof(struct sk_buff)) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) -struct ahash_request; struct net_device; struct scatterlist; struct pipe_inode_info; @@ -4134,9 +4133,6 @@ static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, } int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, struct msghdr *msg); -int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, - struct iov_iter *to, int len, - struct ahash_request *hash); int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, u32 *crcp); int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, -- cgit v1.2.3 From 31afd6bc55cc0093c3e5b0a368319e423d4de8ea Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sat, 17 May 2025 22:13:45 +0200 Subject: net: phy: pass PHY driver to .match_phy_device OP Pass PHY driver pointer to .match_phy_device OP in addition to phydev. Having access to the PHY driver struct might be useful to check the PHY ID of the driver is being matched for in case the PHY ID scanned in the phydev is not consistent. A scenario for this is a PHY that change PHY ID after a firmware is loaded, in such case, the PHY ID stored in PHY device struct is not valid anymore and PHY will manually scan the ID in the match_phy_device function. Having the PHY driver info is also useful for those PHY driver that implement multiple simple .match_phy_device OP to match specific MMD PHY ID. With this extra info if the parsing logic is the same, the matching function can be generalized by using the phy_id in the PHY driver instead of hardcoding. Rust wrapper callback is updated to align to the new match_phy_device arguments. Suggested-by: Russell King (Oracle) Reviewed-by: Russell King (Oracle) Signed-off-by: Christian Marangi Reviewed-by: Benno Lossin # for Rust Reviewed-by: FUJITA Tomonori Link: https://patch.msgid.link/20250517201353.5137-2-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 92a88b5ce356..10e66d45a8e8 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -990,7 +990,8 @@ struct phy_driver { * driver for the given phydev. If NULL, matching is based on * phy_id and phy_id_mask. */ - int (*match_phy_device)(struct phy_device *phydev); + int (*match_phy_device)(struct phy_device *phydev, + const struct phy_driver *phydrv); /** * @set_wol: Some devices (e.g. qnap TS-119P II) require PHY -- cgit v1.2.3 From d6c45707ac84c2d9f274ece1cea4dddb97996bde Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sat, 17 May 2025 22:13:48 +0200 Subject: net: phy: introduce genphy_match_phy_device() Introduce new API, genphy_match_phy_device(), to provide a way to check to match a PHY driver for a PHY device based on the info stored in the PHY device struct. The function generalize the logic used in phy_bus_match() to check the PHY ID whether if C45 or C22 ID should be used for matching. This is useful for custom .match_phy_device function that wants to use the generic logic under some condition. (example a PHY is already setup and provide the correct PHY ID) Reviewed-by: Russell King (Oracle) Signed-off-by: Christian Marangi Link: https://patch.msgid.link/20250517201353.5137-5-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 10e66d45a8e8..32b9da274115 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1868,6 +1868,9 @@ char *phy_attached_info_irq(struct phy_device *phydev) __malloc; void phy_attached_info(struct phy_device *phydev); +int genphy_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv); + /* Clause 22 PHY */ int genphy_read_abilities(struct phy_device *phydev); int genphy_setup_forced(struct phy_device *phydev); -- cgit v1.2.3 From 18355307dc56c198365c6b6b359a4a24db013685 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Wed, 7 May 2025 15:19:13 +0300 Subject: i2c: atr: add static flag Some I2C ATRs do not support dynamic remapping, only static mapping of direct children. Mappings will only be added or removed as a result of devices being added or removed from a child bus. The ATR pool will have to be big enough to accommodate all devices expected to be added to the child buses. Add a new flag that prevents old mappings to be replaced or new mappings to be created in the alias finding code paths. That mens adding a flags parameter to i2c_atr_new() and an i2c_atr_flags enum. Signed-off-by: Cosmin Tanislav Reviewed-by: Luca Ceresoli Reviewed-by: Romain Gantois Signed-off-by: Wolfram Sang --- include/linux/i2c-atr.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c-atr.h b/include/linux/i2c-atr.h index 1c3a5bcd939f..5aaab1598084 100644 --- a/include/linux/i2c-atr.h +++ b/include/linux/i2c-atr.h @@ -18,6 +18,19 @@ struct device; struct fwnode_handle; struct i2c_atr; +/** + * enum i2c_atr_flags - Flags for an I2C ATR driver + * + * @I2C_ATR_F_STATIC: ATR does not support dynamic mapping, use static mapping. + * Mappings will only be added or removed as a result of + * devices being added or removed from a child bus. + * The ATR pool will have to be big enough to accomodate all + * devices expected to be added to the child buses. + */ +enum i2c_atr_flags { + I2C_ATR_F_STATIC = BIT(0), +}; + /** * struct i2c_atr_ops - Callbacks from ATR to the device driver. * @attach_addr: Notify the driver of a new device connected on a child @@ -65,6 +78,7 @@ struct i2c_atr_adap_desc { * @dev: The device acting as an ATR * @ops: Driver-specific callbacks * @max_adapters: Maximum number of child adapters + * @flags: Flags for ATR * * The new ATR helper is connected to the parent adapter but has no child * adapters. Call i2c_atr_add_adapter() to add some. @@ -74,7 +88,8 @@ struct i2c_atr_adap_desc { * Return: pointer to the new ATR helper object, or ERR_PTR */ struct i2c_atr *i2c_atr_new(struct i2c_adapter *parent, struct device *dev, - const struct i2c_atr_ops *ops, int max_adapters); + const struct i2c_atr_ops *ops, int max_adapters, + u32 flags); /** * i2c_atr_delete - Delete an I2C ATR helper. -- cgit v1.2.3 From 17a3a30e8e3df77d1d1f5bc705b903604a1eedc9 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Wed, 7 May 2025 15:19:15 +0300 Subject: i2c: atr: add passthrough flag Some I2C ATRs can have other I2C ATRs as children. The I2C messages of the child ATRs need to be forwarded as-is if the parent I2C ATR can only do static mapping. In the case of GMSL, the deserializer I2C ATR actually doesn't have I2C address remapping hardware capabilities, but it is able to select which GMSL link to talk to, allowing it to change the address of the serializer. The child ATRs need to have their alias pools defined in such a way to prevent overlapping addresses between them, but there's no way around this without orchestration between multiple ATR instances. To allow for this use-case, add a flag that allows unmapped addresses to be passed through, since they are already remapped by the child ATRs. There's no case where an address that has not been remapped by the child ATR will hit the parent ATR. Signed-off-by: Cosmin Tanislav Reviewed-by: Luca Ceresoli Reviewed-by: Romain Gantois Signed-off-by: Wolfram Sang --- include/linux/i2c-atr.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c-atr.h b/include/linux/i2c-atr.h index 5aaab1598084..2bb54dc87c8e 100644 --- a/include/linux/i2c-atr.h +++ b/include/linux/i2c-atr.h @@ -26,9 +26,11 @@ struct i2c_atr; * devices being added or removed from a child bus. * The ATR pool will have to be big enough to accomodate all * devices expected to be added to the child buses. + * @I2C_ATR_F_PASSTHROUGH: Allow unmapped incoming addresses to pass through */ enum i2c_atr_flags { I2C_ATR_F_STATIC = BIT(0), + I2C_ATR_F_PASSTHROUGH = BIT(1), }; /** -- cgit v1.2.3 From 6adf48a3aa316ce360e02dd10222e96da9a0eff5 Mon Sep 17 00:00:00 2001 From: Artur Weber Date: Thu, 15 May 2025 16:16:30 +0200 Subject: mfd: bcm590xx: Add support for multiple device types + BCM59054 compatible The BCM59054 is another chip from the BCM590xx line of PMUs, commonly used on devices with the BCM21664/BCM23550 chipsets. Prepare the BCM590xx driver for supporting other devices by adding the PMUID register values for supported chip types and store them in the MFD data struct as "pmu_id". (These will be checked against the actual PMUID register values in a later commit.) Then, add a DT compatible for the BCM59054, and provide the PMU ID as OF match data. Signed-off-by: Artur Weber Reviewed-by: Stanislav Jakubek Link: https://lore.kernel.org/r/20250515-bcm59054-v9-3-14ba0ea2ea5b@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/bcm590xx.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/bcm590xx.h b/include/linux/mfd/bcm590xx.h index 6b8791da6119..76c30e629333 100644 --- a/include/linux/mfd/bcm590xx.h +++ b/include/linux/mfd/bcm590xx.h @@ -13,6 +13,10 @@ #include #include +/* PMU ID register values; also used as device type */ +#define BCM590XX_PMUID_BCM59054 0x54 +#define BCM590XX_PMUID_BCM59056 0x56 + /* max register address */ #define BCM590XX_MAX_REGISTER_PRI 0xe7 #define BCM590XX_MAX_REGISTER_SEC 0xf0 @@ -24,6 +28,9 @@ struct bcm590xx { struct regmap *regmap_pri; struct regmap *regmap_sec; unsigned int id; + + /* PMU ID value; also used as device type */ + u8 pmu_id; }; #endif /* __LINUX_MFD_BCM590XX_H */ -- cgit v1.2.3 From d310cdbb4ee6285f374d4dfc32173c35f8a2273e Mon Sep 17 00:00:00 2001 From: Artur Weber Date: Thu, 15 May 2025 16:16:31 +0200 Subject: mfd: bcm590xx: Add PMU ID/revision parsing function The BCM590xx PMUs have two I2C registers for reading the PMU ID and revision. The revision is useful for subdevice drivers, since different revisions may have slight differences in behavior (for example - BCM59054 has different regulator configurations for revision A0 and A1). Check the PMU ID register and make sure it matches the DT compatible. Fetch the digital and analog revision from the PMUREV register so that it can be used in subdevice drivers. Also add some known revision values to bcm590xx.h, for convenience when writing subdevice drivers. Signed-off-by: Artur Weber Reviewed-by: Stanislav Jakubek Link: https://lore.kernel.org/r/20250515-bcm59054-v9-4-14ba0ea2ea5b@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/bcm590xx.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/bcm590xx.h b/include/linux/mfd/bcm590xx.h index 76c30e629333..a54c50b2d2c8 100644 --- a/include/linux/mfd/bcm590xx.h +++ b/include/linux/mfd/bcm590xx.h @@ -17,6 +17,16 @@ #define BCM590XX_PMUID_BCM59054 0x54 #define BCM590XX_PMUID_BCM59056 0x56 +/* Known chip revision IDs */ +#define BCM59054_REV_DIGITAL_A1 1 +#define BCM59054_REV_ANALOG_A1 2 + +#define BCM59056_REV_DIGITAL_A0 1 +#define BCM59056_REV_ANALOG_A0 1 + +#define BCM59056_REV_DIGITAL_B0 2 +#define BCM59056_REV_ANALOG_B0 2 + /* max register address */ #define BCM590XX_MAX_REGISTER_PRI 0xe7 #define BCM590XX_MAX_REGISTER_SEC 0xf0 @@ -31,6 +41,10 @@ struct bcm590xx { /* PMU ID value; also used as device type */ u8 pmu_id; + + /* Chip revision, read from PMUREV reg */ + u8 rev_digital; + u8 rev_analog; }; #endif /* __LINUX_MFD_BCM590XX_H */ -- cgit v1.2.3 From 75dc12b4450269821fca4c8634f5185d28cf2117 Mon Sep 17 00:00:00 2001 From: Artur Weber Date: Thu, 15 May 2025 16:16:33 +0200 Subject: regulator: bcm590xx: Store regulator descriptions in table Instead of filling in the regulator description programatically, store the data in a struct. This will make it a bit nicer to introduce support for other BCM590xx chips besides the BCM59056. To do this, add a new struct type, bcm590xx_reg_data, to store all of the necessary information. Drop the old IS_LDO, IS_GPLDO... macros in favor of the "type" field in this struct. Adapt the old bcm590xx_reg struct to the new types. Signed-off-by: Artur Weber Reviewed-by: Stanislav Jakubek Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20250515-bcm59054-v9-6-14ba0ea2ea5b@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/bcm590xx.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/bcm590xx.h b/include/linux/mfd/bcm590xx.h index a54c50b2d2c8..09d4ae054242 100644 --- a/include/linux/mfd/bcm590xx.h +++ b/include/linux/mfd/bcm590xx.h @@ -27,6 +27,12 @@ #define BCM59056_REV_DIGITAL_B0 2 #define BCM59056_REV_ANALOG_B0 2 +/* regmap types */ +enum bcm590xx_regmap_type { + BCM590XX_REGMAP_PRI, + BCM590XX_REGMAP_SEC, +}; + /* max register address */ #define BCM590XX_MAX_REGISTER_PRI 0xe7 #define BCM590XX_MAX_REGISTER_SEC 0xf0 -- cgit v1.2.3 From 1007ae0d464ceb55a3740634790521d3543aaab9 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 22 May 2025 12:47:31 +0200 Subject: spi: use container_of_cont() for to_spi_device() Some places in the spi core pass in a const pointer to a device and the default container_of() casts that away, which is not a good idea. Preserve the proper const attribute by using container_of_const() for to_spi_device() instead, which is what it was designed for. Note, this removes the NULL check for a device pointer in the call, but no one was ever checking for that return value, and a device pointer should never be NULL overall anyway, so this should be a safe change. Cc: Mark Brown Fixes: d69d80484598 ("driver core: have match() callback in struct bus_type take a const *") Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2025052230-fidgeting-stooge-66f5@gregkh Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 0ba5e49bace4..6e64f0193777 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -249,10 +249,7 @@ struct spi_device { static_assert((SPI_MODE_KERNEL_MASK & SPI_MODE_USER_MASK) == 0, "SPI_MODE_USER_MASK & SPI_MODE_KERNEL_MASK must not overlap"); -static inline struct spi_device *to_spi_device(const struct device *dev) -{ - return dev ? container_of(dev, struct spi_device, dev) : NULL; -} +#define to_spi_device(__dev) container_of_const(__dev, struct spi_device, dev) /* Most drivers won't need to care about device refcounting */ static inline struct spi_device *spi_dev_get(struct spi_device *spi) -- cgit v1.2.3 From 1c12fbdf40e17df2efc24bf2009a0c3bfa75bfa7 Mon Sep 17 00:00:00 2001 From: Mathieu Dubois-Briand Date: Thu, 22 May 2025 14:06:20 +0200 Subject: regmap: irq: Add support for chips without separate IRQ status Some GPIO chips allow to rise an IRQ on GPIO level changes but do not provide an IRQ status for each separate line: only the current gpio level can be retrieved. Add support for these chips, emulating IRQ status by comparing GPIO levels with the levels during the previous interrupt. Signed-off-by: Mathieu Dubois-Briand Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20250522-mdb-max7360-support-v9-5-74fc03517e41@bootlin.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index d17c5ea3d55d..02b83f5499b8 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1641,6 +1641,8 @@ struct regmap_irq_chip_data; * @ack_invert: Inverted ack register: cleared bits for ack. * @clear_ack: Use this to set 1 and 0 or vice-versa to clear interrupts. * @status_invert: Inverted status register: cleared bits are active interrupts. + * @status_is_level: Status register is actuall signal level: Xor status + * register with previous value to get active interrupts. * @wake_invert: Inverted wake register: cleared bits are wake enabled. * @type_in_mask: Use the mask registers for controlling irq type. Use this if * the hardware provides separate bits for rising/falling edge @@ -1704,6 +1706,7 @@ struct regmap_irq_chip { unsigned int ack_invert:1; unsigned int clear_ack:1; unsigned int status_invert:1; + unsigned int status_is_level:1; unsigned int wake_invert:1; unsigned int type_in_mask:1; unsigned int clear_on_unmask:1; -- cgit v1.2.3 From 4ff4d86f6cceb6bea583bdb230e5439655778cce Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Mon, 19 May 2025 10:45:05 +0200 Subject: net: Add support for providing the PTP hardware source in tsinfo Multi-PTP source support within a network topology has been merged, but the hardware timestamp source is not yet exposed to users. Currently, users only see the PTP index, which does not indicate whether the timestamp comes from a PHY or a MAC. Add support for reporting the hwtstamp source using a hwtstamp-source field, alongside hwtstamp-phyindex, to describe the origin of the hardware timestamp. Remove HWTSTAMP_SOURCE_UNSPEC enum value as it is not used at all. Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20250519-feature_ptp_source-v4-1-5d10e19a0265@bootlin.com Signed-off-by: Paolo Abeni --- include/linux/ethtool.h | 5 +++++ include/linux/net_tstamp.h | 7 +------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 117718c24814..5e0dd333ad1f 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #define ETHTOOL_MM_MAX_VERIFY_TIME_MS 128 @@ -830,6 +831,8 @@ struct ethtool_rxfh_param { * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags * @phc_index: device index of the associated PHC, or -1 if there is none * @phc_qualifier: qualifier of the associated PHC + * @phc_source: source device of the associated PHC + * @phc_phyindex: index of PHY device source of the associated PHC * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values */ @@ -838,6 +841,8 @@ struct kernel_ethtool_ts_info { u32 so_timestamping; int phc_index; enum hwtstamp_provider_qualifier phc_qualifier; + enum hwtstamp_source phc_source; + int phc_phyindex; enum hwtstamp_tx_types tx_types; enum hwtstamp_rx_filters rx_filters; }; diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index ff0758e88ea1..f4936d9c2b3c 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -4,6 +4,7 @@ #define _LINUX_NET_TIMESTAMPING_H_ #include +#include #define SOF_TIMESTAMPING_SOFTWARE_MASK (SOF_TIMESTAMPING_RX_SOFTWARE | \ SOF_TIMESTAMPING_TX_SOFTWARE | \ @@ -13,12 +14,6 @@ SOF_TIMESTAMPING_TX_HARDWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) -enum hwtstamp_source { - HWTSTAMP_SOURCE_UNSPEC, - HWTSTAMP_SOURCE_NETDEV, - HWTSTAMP_SOURCE_PHYLIB, -}; - /** * struct hwtstamp_provider_desc - hwtstamp provider description * -- cgit v1.2.3 From 1d2aeee6dd629084cc524ec2d00100e70aa3c824 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 18 Mar 2025 10:13:41 +0100 Subject: mfd: aat2870: Use per-client debugfs directory The I2C core now provides a debugfs entry for each client. Let this driver use it instead of the custom directory in debugfs root. Further improvements by this change: automatic clean up on removal, support of multiple instances. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20250318091426.22258-2-wsa+renesas@sang-engineering.com Signed-off-by: Lee Jones --- include/linux/mfd/aat2870.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/aat2870.h b/include/linux/mfd/aat2870.h index 2445842d482d..c7a3c53eba68 100644 --- a/include/linux/mfd/aat2870.h +++ b/include/linux/mfd/aat2870.h @@ -133,9 +133,6 @@ struct aat2870_data { int (*read)(struct aat2870_data *aat2870, u8 addr, u8 *val); int (*write)(struct aat2870_data *aat2870, u8 addr, u8 val); int (*update)(struct aat2870_data *aat2870, u8 addr, u8 mask, u8 val); - - /* for debugfs */ - struct dentry *dentry_root; }; struct aat2870_subdev_info { -- cgit v1.2.3 From 3a91f28fab43f093c72312148288d125ae160c2d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 22 May 2025 23:20:39 +0800 Subject: io_uring: add helper io_uring_cmd_ctx_handle() Add helper io_uring_cmd_ctx_handle() for driver to track per-context resource, such as registered kernel io buffer. Suggested-by: Caleb Sander Mateos Signed-off-by: Ming Lei Reviewed-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250522152043.399824-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 0634a3de1782..53408124c1e5 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -140,6 +140,15 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur return cmd_to_io_kiocb(cmd)->async_data; } +/* + * Return uring_cmd's context reference as its context handle for driver to + * track per-context resource, such as registered kernel IO buffer + */ +static inline void *io_uring_cmd_ctx_handle(struct io_uring_cmd *cmd) +{ + return cmd_to_io_kiocb(cmd)->ctx; +} + int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, void (*release)(void *), unsigned int index, unsigned int issue_flags); -- cgit v1.2.3 From 9e8c67a9e526f497d606d721cf6b92dd68d90806 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2025 00:44:44 -0400 Subject: sched_ext: Introduce cgroup_lifetime_notifier Other subsystems may make use of the cgroup hierarchy with the cgroup_bpf support being one such example. For such a feature, it's useful to be able to hook into cgroup creation and destruction paths to perform feature-specific initializations and cleanups. Add cgroup_lifetime_notifier which generates CGROUP_LIFETIME_ONLINE and CGROUP_LIFETIME_OFFLINE events whenever cgroups are created and destroyed, respectively. The next patch will convert cgroup_bpf to use the new notifier and other uses are planned. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 989c08b09691..7865b681731c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -40,7 +41,7 @@ struct kernel_clone_args; #ifdef CONFIG_CGROUPS -enum { +enum css_task_iter_flags { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ CSS_TASK_ITER_SKIPPED = (1U << 16), /* internal flags */ @@ -66,10 +67,16 @@ struct css_task_iter { struct list_head iters_node; /* css_set->task_iters */ }; +enum cgroup_lifetime_events { + CGROUP_LIFETIME_ONLINE, + CGROUP_LIFETIME_OFFLINE, +}; + extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; extern spinlock_t css_set_lock; +extern struct blocking_notifier_head cgroup_lifetime_notifier; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include -- cgit v1.2.3 From 82648b8b2ae0a0ff371e2a98133844658cfaae9a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2025 00:46:12 -0400 Subject: sched_ext: Convert cgroup BPF support to use cgroup_lifetime_notifier Replace explicit cgroup_bpf_inherit/offline() calls from cgroup creation/destruction paths with notification callback registered on cgroup_lifetime_notifier. Signed-off-by: Tejun Heo --- include/linux/bpf-cgroup.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 9de7adb68294..60d1511b4f4d 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -114,8 +114,7 @@ struct bpf_prog_list { u32 flags; }; -int cgroup_bpf_inherit(struct cgroup *cgrp); -void cgroup_bpf_offline(struct cgroup *cgrp); +void __init cgroup_bpf_lifetime_notifier_init(void); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, @@ -431,8 +430,10 @@ const struct bpf_func_proto * cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); #else -static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } -static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} +static inline void cgroup_bpf_lifetime_notifier_init(void) +{ + return; +} static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, -- cgit v1.2.3 From 3f12680913fda8de06c21e836dd5f246fe1684e5 Mon Sep 17 00:00:00 2001 From: Yuquan Wang Date: Thu, 8 May 2025 10:27:19 +0800 Subject: mm: numa_memblks: introduce numa_add_reserved_memblk acpi_parse_cfmws() currently adds empty CFMWS ranges to numa_meminfo with the expectation that numa_cleanup_meminfo moves them to numa_reserved_meminfo. There is no need for that indirection when it is known in advance that these unpopulated ranges are meant for numa_reserved_meminfo in support of future hotplug / CXL provisioning. Introduce and use numa_add_reserved_memblk() to add the empty CFMWS ranges directly. Link: https://lkml.kernel.org/r/20250508022719.3941335-1-wangyuquan1236@phytium.com.cn Signed-off-by: Yuquan Wang Reviewed-by: Alison Schofield Cc: Bruno Faccini Cc: Chen Baozi Cc: Dan Williams Cc: David Hildenbrand Cc: Haibo Xu Cc: Huacai Chen Cc: Joanthan Cameron Cc: Len Brown Cc: Mike Rapoport Cc: Robert Richter Signed-off-by: Andrew Morton --- include/linux/numa_memblks.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h index dd85613cdd86..991076cba7c5 100644 --- a/include/linux/numa_memblks.h +++ b/include/linux/numa_memblks.h @@ -22,6 +22,7 @@ struct numa_meminfo { }; int __init numa_add_memblk(int nodeid, u64 start, u64 end); +int __init numa_add_reserved_memblk(int nid, u64 start, u64 end); void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); int __init numa_cleanup_meminfo(struct numa_meminfo *mi); -- cgit v1.2.3 From e1e1a3ae7f9f0cb06e80af0f24927be63149d081 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 12 May 2025 14:34:15 +0200 Subject: mm: convert track_pfn_insert() to pfnmap_setup_cachemode*() ... by factoring it out from track_pfn_remap() into pfnmap_setup_cachemode() and provide pfnmap_setup_cachemode_pfn() as a replacement for track_pfn_insert(). For PMDs/PUDs, we keep checking a single pfn only. Add some documentation, and also document why it is valid to not check the whole pfn range. We'll reuse pfnmap_setup_cachemode() from core MM next. Link: https://lkml.kernel.org/r/20250512123424.637989-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Ingo Molnar [x86 bits] Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Dave Airlie Cc: "H. Peter Anvin" Cc: Jani Nikula Cc: Jann Horn Cc: Jonas Lahtinen Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Peter Xu Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Tvrtko Ursulin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 52 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index f1e890b60460..be1745839871 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1496,13 +1496,10 @@ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, return 0; } -/* - * track_pfn_insert is called when a _new_ single pfn is established - * by vmf_insert_pfn(). - */ -static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, - pfn_t pfn) +static inline int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, + pgprot_t *prot) { + return 0; } /* @@ -1552,8 +1549,32 @@ static inline void untrack_pfn_clear(struct vm_area_struct *vma) extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size); -extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, - pfn_t pfn); + +/** + * pfnmap_setup_cachemode - setup the cachemode in the pgprot for a pfn range + * @pfn: the start of the pfn range + * @size: the size of the pfn range in bytes + * @prot: the pgprot to modify + * + * Lookup the cachemode for the pfn range starting at @pfn with the size + * @size and store it in @prot, leaving other data in @prot unchanged. + * + * This allows for a hardware implementation to have fine-grained control of + * memory cache behavior at page level granularity. Without a hardware + * implementation, this function does nothing. + * + * Currently there is only one implementation for this - x86 Page Attribute + * Table (PAT). See Documentation/arch/x86/pat.rst for more details. + * + * This function can fail if the pfn range spans pfns that require differing + * cachemodes. If the pfn range was previously verified to have a single + * cachemode, it is sufficient to query only a single pfn. The assumption is + * that this is the case for drivers using the vmf_insert_pfn*() interface. + * + * Returns 0 on success and -EINVAL on error. + */ +int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, + pgprot_t *prot); extern int track_pfn_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long *pfn); extern void untrack_pfn_copy(struct vm_area_struct *dst_vma, @@ -1563,6 +1584,21 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, extern void untrack_pfn_clear(struct vm_area_struct *vma); #endif +/** + * pfnmap_setup_cachemode_pfn - setup the cachemode in the pgprot for a pfn + * @pfn: the pfn + * @prot: the pgprot to modify + * + * Lookup the cachemode for @pfn and store it in @prot, leaving other + * data in @prot unchanged. + * + * See pfnmap_setup_cachemode() for details. + */ +static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) +{ + pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot); +} + #ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) -- cgit v1.2.3 From db44863a4d9df3604c4ff76507bb2056b6392e58 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 12 May 2025 14:34:16 +0200 Subject: mm: introduce pfnmap_track() and pfnmap_untrack() and use them for memremap Let's provide variants of track_pfn_remap() and untrack_pfn() that won't mess with VMAs, and replace the usage in mm/memremap.c. Add some documentation. Link: https://lkml.kernel.org/r/20250512123424.637989-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Acked-by: Ingo Molnar [x86 bits] Reviewed-by: Liam R. Howlett Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Dave Airlie Cc: "H. Peter Anvin" Cc: Jani Nikula Cc: Jann Horn Cc: Jonas Lahtinen Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Peter Xu Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Tvrtko Ursulin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index be1745839871..90f72cd35839 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1502,6 +1502,16 @@ static inline int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, return 0; } +static inline int pfnmap_track(unsigned long pfn, unsigned long size, + pgprot_t *prot) +{ + return 0; +} + +static inline void pfnmap_untrack(unsigned long pfn, unsigned long size) +{ +} + /* * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page * tables copied during copy_page_range(). Will store the pfn to be @@ -1575,6 +1585,35 @@ extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, */ int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot); + +/** + * pfnmap_track - track a pfn range + * @pfn: the start of the pfn range + * @size: the size of the pfn range in bytes + * @prot: the pgprot to track + * + * Requested the pfn range to be 'tracked' by a hardware implementation and + * setup the cachemode in @prot similar to pfnmap_setup_cachemode(). + * + * This allows for fine-grained control of memory cache behaviour at page + * level granularity. Tracking memory this way is persisted across VMA splits + * (VMA merging does not apply for VM_PFNMAP). + * + * Currently, there is only one implementation for this - x86 Page Attribute + * Table (PAT). See Documentation/arch/x86/pat.rst for more details. + * + * Returns 0 on success and -EINVAL on error. + */ +int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot); + +/** + * pfnmap_untrack - untrack a pfn range + * @pfn: the start of the pfn range + * @size: the size of the pfn range in bytes + * + * Untrack a pfn range previously tracked through pfnmap_track(). + */ +void pfnmap_untrack(unsigned long pfn, unsigned long size); extern int track_pfn_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long *pfn); extern void untrack_pfn_copy(struct vm_area_struct *dst_vma, -- cgit v1.2.3 From f8e97613fed25758ddf52159b87e1c66e619a23a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 12 May 2025 14:34:17 +0200 Subject: mm: convert VM_PFNMAP tracking to pfnmap_track() + pfnmap_untrack() Let's use our new interface. In remap_pfn_range(), we'll now decide whether we have to track (full VMA covered) or only lookup the cachemode (partial VMA covered). Remember what we have to untrack by linking it from the VMA. When duplicating VMAs (e.g., splitting, mremap, fork), we'll handle it similar to anon VMA names, and use a kref to share the tracking. Once the last VMA un-refs our tracking data, we'll do the untracking, which simplifies things a lot and should sort our various issues we saw recently, for example, when partially unmapping/zapping a tracked VMA. This change implies that we'll keep tracking the original PFN range even after splitting + partially unmapping it: not too bad, because it was not working reliably before. The only thing that kind-of worked before was shrinking such a mapping using mremap(): we managed to adjust the reservation in a hacky way, now we won't adjust the reservation but leave it around until all involved VMAs are gone. If that ever turns out to be an issue, we could hook into VM splitting code and split the tracking; however, that adds complexity that might not be required, so we'll keep it simple for now. Link: https://lkml.kernel.org/r/20250512123424.637989-5-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Ingo Molnar [x86 bits] Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Dave Airlie Cc: "H. Peter Anvin" Cc: Jani Nikula Cc: Jann Horn Cc: Jonas Lahtinen Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Peter Xu Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Tvrtko Ursulin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 2 ++ include/linux/mm_types.h | 11 +++++++++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index f9157a0c42a5..89b518ff097e 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -447,6 +447,8 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, #endif /* CONFIG_ANON_VMA_NAME */ +void pfnmap_track_ctx_release(struct kref *ref); + static inline void init_tlb_flush_pending(struct mm_struct *mm) { atomic_set(&mm->tlb_flush_pending, 0); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 15808cad2bc1..3e934dc6057c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -763,6 +763,14 @@ struct vma_numab_state { int prev_scan_seq; }; +#ifdef __HAVE_PFNMAP_TRACKING +struct pfnmap_track_ctx { + struct kref kref; + unsigned long pfn; + unsigned long size; /* in bytes */ +}; +#endif + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -900,6 +908,9 @@ struct vm_area_struct { struct anon_vma_name *anon_name; #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +#ifdef __HAVE_PFNMAP_TRACKING + struct pfnmap_track_ctx *pfnmap_track_ctx; +#endif } __randomize_layout; #ifdef CONFIG_NUMA -- cgit v1.2.3 From 7bd7d74ec01954fde9eb65b065eb55bcda4f86e2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 12 May 2025 14:34:18 +0200 Subject: x86/mm/pat: remove old pfnmap tracking interface We can now get rid of the old interface along with get_pat_info() and follow_phys(). Link: https://lkml.kernel.org/r/20250512123424.637989-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Acked-by: Ingo Molnar [x86 bits] Reviewed-by: Liam R. Howlett Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Dave Airlie Cc: "H. Peter Anvin" Cc: Jani Nikula Cc: Jann Horn Cc: Jonas Lahtinen Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Peter Xu Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Tvrtko Ursulin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 66 ------------------------------------------------- 1 file changed, 66 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 90f72cd35839..0b6e1f781d86 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1485,17 +1485,6 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) * vmf_insert_pfn. */ -/* - * track_pfn_remap is called when a _new_ pfn mapping is being established - * by remap_pfn_range() for physical range indicated by pfn and size. - */ -static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long addr, - unsigned long size) -{ - return 0; -} - static inline int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot) { @@ -1511,55 +1500,7 @@ static inline int pfnmap_track(unsigned long pfn, unsigned long size, static inline void pfnmap_untrack(unsigned long pfn, unsigned long size) { } - -/* - * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page - * tables copied during copy_page_range(). Will store the pfn to be - * passed to untrack_pfn_copy() only if there is something to be untracked. - * Callers should initialize the pfn to 0. - */ -static inline int track_pfn_copy(struct vm_area_struct *dst_vma, - struct vm_area_struct *src_vma, unsigned long *pfn) -{ - return 0; -} - -/* - * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during - * copy_page_range(), but after track_pfn_copy() was already called. Can - * be called even if track_pfn_copy() did not actually track anything: - * handled internally. - */ -static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma, - unsigned long pfn) -{ -} - -/* - * untrack_pfn is called while unmapping a pfnmap for a region. - * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case pfn, size are zero). - */ -static inline void untrack_pfn(struct vm_area_struct *vma, - unsigned long pfn, unsigned long size, - bool mm_wr_locked) -{ -} - -/* - * untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA: - * - * 1) During mremap() on the src VMA after the page tables were moved. - * 2) During fork() on the dst VMA, immediately after duplicating the src VMA. - */ -static inline void untrack_pfn_clear(struct vm_area_struct *vma) -{ -} #else -extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long addr, - unsigned long size); - /** * pfnmap_setup_cachemode - setup the cachemode in the pgprot for a pfn range * @pfn: the start of the pfn range @@ -1614,13 +1555,6 @@ int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot); * Untrack a pfn range previously tracked through pfnmap_track(). */ void pfnmap_untrack(unsigned long pfn, unsigned long size); -extern int track_pfn_copy(struct vm_area_struct *dst_vma, - struct vm_area_struct *src_vma, unsigned long *pfn); -extern void untrack_pfn_copy(struct vm_area_struct *dst_vma, - unsigned long pfn); -extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size, bool mm_wr_locked); -extern void untrack_pfn_clear(struct vm_area_struct *vma); #endif /** -- cgit v1.2.3 From cba4dbeb7bfcf8b69d491288c3bf877ead883214 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 12 May 2025 14:34:19 +0200 Subject: mm: remove VM_PAT It's unused, so let's remove it. Link: https://lkml.kernel.org/r/20250512123424.637989-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Acked-by: Ingo Molnar [x86 bits] Reviewed-by: Liam R. Howlett Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Dave Airlie Cc: "H. Peter Anvin" Cc: Jani Nikula Cc: Jann Horn Cc: Jonas Lahtinen Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Peter Xu Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Steven Rostedt Cc: Thomas Gleinxer Cc: Tvrtko Ursulin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 43748c8f3454..a916ea42cfd5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -357,9 +357,7 @@ extern unsigned int kobjsize(const void *objp); # define VM_SHADOW_STACK VM_NONE #endif -#if defined(CONFIG_X86) -# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ -#elif defined(CONFIG_PPC64) +#if defined(CONFIG_PPC64) # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ #elif defined(CONFIG_PARISC) # define VM_GROWSUP VM_ARCH_1 -- cgit v1.2.3 From 5053383829ab2b17dc4766a832004c848f4af9df Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 12 May 2025 10:57:11 +0800 Subject: mm: khugepaged: convert set_huge_pmd() to take a folio We've already gotten the stable locked folio in collapse_pte_mapped_thp(), so just use folio for set_huge_pmd() to set the PMD entry, which is more straightforward. Moreover, we will check the folio size in do_set_pmd(), so we can remove the unnecessary VM_BUG_ON() in set_huge_pmd(). While we are at it, we can also remove the PageTransHuge(), as it currently has no callers. Link: https://lkml.kernel.org/r/110c3e1ec5fe7854a0e2c95ffcbc985817180ed7.1747017104.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Cc: Dev Jain Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 37b11f15dbd9..1c1d49554c71 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -907,20 +907,6 @@ FOLIO_FLAG_FALSE(partially_mapped) #define PG_head_mask ((1UL << PG_head)) #ifdef CONFIG_TRANSPARENT_HUGEPAGE -/* - * PageHuge() only returns true for hugetlbfs pages, but not for - * normal or transparent huge pages. - * - * PageTransHuge() returns true for both transparent huge and - * hugetlbfs pages, but not normal pages. PageTransHuge() can only be - * called only in the core VM paths where hugetlbfs pages can't exist. - */ -static inline int PageTransHuge(const struct page *page) -{ - VM_BUG_ON_PAGE(PageTail(page), page); - return PageHead(page); -} - /* * PageTransCompound returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known @@ -931,7 +917,6 @@ static inline int PageTransCompound(const struct page *page) return PageCompound(page); } #else -TESTPAGEFLAG_FALSE(TransHuge, transhuge) TESTPAGEFLAG_FALSE(TransCompound, transcompound) #endif -- cgit v1.2.3 From 698c0089cdf0b27d37f0b24824b682c23de5f72d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 12 May 2025 10:57:12 +0800 Subject: mm: convert do_set_pmd() to take a folio In do_set_pmd(), we always use the folio->page to build PMD mappings for the entire folio. Since all callers of do_set_pmd() already hold a stable folio, converting do_set_pmd() to take a folio is safe and more straightforward. In addition, to ensure the extensibility of do_set_pmd() for supporting larger folios beyond PMD size, we keep the 'page' parameter to specify which page within the folio should be mapped. No functional changes expected. Link: https://lkml.kernel.org/r/9b488f4ecb4d3fd8634e3d448dd0ed6964482480.1747017104.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Zi Yan Acked-by: David Hildenbrand Cc: Dev Jain Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a916ea42cfd5..cd2e513189d6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1235,7 +1235,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page); void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct page *page, unsigned int nr, unsigned long addr); -- cgit v1.2.3 From 6669d1aaa0c45a50a4cc5f1756ab03578eaebd18 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 14 May 2025 09:40:24 +0100 Subject: mm: remove WARN_ON_ONCE() in file_has_valid_mmap_hooks() Having encountered a trinity report in linux-next (Linked in the 'Closes' tag) it appears that there are legitimate situations where a file-backed mapping can be acquired but no file->f_op->mmap or file->f_op->mmap_prepare is set, at which point do_mmap() should simply error out with -ENODEV. Since previously we did not warn in this scenario and it appears we rely upon this, restore this situation, while retaining a WARN_ON_ONCE() for the case where both are set, which is absolutely incorrect and must be addressed and thus always requires a warning. If further work is required to chase down precisely what is causing this, then we can later restore this, but it makes no sense to hold up this series to do so, as this is existing and apparently expected behaviour. Link: https://lkml.kernel.org/r/20250514084024.29148-1-lorenzo.stoakes@oracle.com Fixes: c84bf6dd2b83 ("mm: introduce new .mmap_prepare() file callback") Signed-off-by: Lorenzo Stoakes Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202505141434.96ce5e5d-lkp@intel.com Reviewed-by: Vlastimil Babka Reviewed-by: Pedro Falcato Acked-by: David Hildenbrand Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jann Horn Cc: Liam Howlett Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e2721a1ff13d..09c8495dacdb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2248,7 +2248,7 @@ static inline bool file_has_valid_mmap_hooks(struct file *file) /* Hooks are mutually exclusive. */ if (WARN_ON_ONCE(has_mmap && has_mmap_prepare)) return false; - if (WARN_ON_ONCE(!has_mmap && !has_mmap_prepare)) + if (!has_mmap && !has_mmap_prepare) return false; return true; -- cgit v1.2.3 From 2aad4edf6e1018b28b7000faec56b7b6e585c8e1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 16 May 2025 17:34:46 -0700 Subject: mm: rename try_alloc_pages() to alloc_pages_nolock() The "try_" prefix is confusing, since it made people believe that try_alloc_pages() is analogous to spin_trylock() and NULL return means EAGAIN. This is not the case. If it returns NULL there is no reason to call it again. It will most likely return NULL again. Hence rename it to alloc_pages_nolock() to make it symmetrical to free_pages_nolock() and document that NULL means ENOMEM. Link: https://lkml.kernel.org/r/20250517003446.60260-1-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Harry Yoo Cc: Andrii Nakryiko Cc: Kumar Kartikeya Dwivedi Cc: Michal Hocko Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Signed-off-by: Andrew Morton --- include/linux/gfp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c9fa6309c903..be160e8d8bcb 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -45,13 +45,13 @@ static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags) * !__GFP_DIRECT_RECLAIM -> direct claim is not allowed. * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd. * All GFP_* flags including GFP_NOWAIT use one or both flags. - * try_alloc_pages() is the only API that doesn't specify either flag. + * alloc_pages_nolock() is the only API that doesn't specify either flag. * * This is stronger than GFP_NOWAIT or GFP_ATOMIC because * those are guaranteed to never block on a sleeping lock. * Here we are enforcing that the allocation doesn't ever spin * on any locks (i.e. only trylocks). There is no high level - * GFP_$FOO flag for this use in try_alloc_pages() as the + * GFP_$FOO flag for this use in alloc_pages_nolock() as the * regular page allocator doesn't fully support this * allocation mode. */ @@ -354,8 +354,8 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp, } #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) -struct page *try_alloc_pages_noprof(int nid, unsigned int order); -#define try_alloc_pages(...) alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__)) +struct page *alloc_pages_nolock_noprof(int nid, unsigned int order); +#define alloc_pages_nolock(...) alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__)) extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); #define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__)) -- cgit v1.2.3 From cc79061b8fc119807111b615aa791562374b15b2 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 13 May 2025 14:56:35 +0800 Subject: mm: khugepaged: decouple SHMEM and file folios' collapse Originally, the file pages collapse was intended for tmpfs/shmem to merge into THP in the background. However, now not only tmpfs/shmem can support large folios, but some other file systems (such as XFS, erofs ...) also support large folios. Therefore, it is time to decouple the support of file folios collapse from SHMEM. Link: https://lkml.kernel.org/r/ce5c2314e0368cf34bda26f9bacf01c982d4da17.1747119309.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Acked-by: Zi Yan Cc: Dev Jain Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 1f46046080f5..b8d69cfbb58b 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -15,16 +15,8 @@ extern void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); extern bool current_is_khugepaged(void); -#ifdef CONFIG_SHMEM extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd); -#else -static inline int collapse_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr, bool install_pmd) -{ - return 0; -} -#endif static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { -- cgit v1.2.3 From 8814e3b8692b31e0150a894dd70c14ca0b7b746a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 14 May 2025 11:41:54 -0700 Subject: memcg: make mod_memcg_state re-entrant safe against irqs Let's make mod_memcg_state re-entrant safe against irqs. The only thing needed is to convert the usage of __this_cpu_add() to this_cpu_add(). In addition, with re-entrant safety, there is no need to disable irqs. mod_memcg_state() is not safe against nmi, so let's add warning if someone tries to call it in nmi context. Link: https://lkml.kernel.org/r/20250514184158.3471331-4-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9ed75f82b858..92861ff3c43f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -903,19 +903,9 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); -void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, - int val); - /* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_memcg_state(memcg, idx, val); - local_irq_restore(flags); -} +void mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, int val); static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) @@ -1375,12 +1365,6 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { } -static inline void __mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, - int nr) -{ -} - static inline void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int nr) -- cgit v1.2.3 From e52401e7247bb36cabba389ae32fb75a12a6e94b Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 14 May 2025 11:41:55 -0700 Subject: memcg: make count_memcg_events re-entrant safe against irqs Let's make count_memcg_events re-entrant safe against irqs. The only thing needed is to convert the usage of __this_cpu_add() to this_cpu_add(). In addition, with re-entrant safety, there is no need to disable irqs. Also add warnings for in_nmi() as it is not safe against nmi context. Link: https://lkml.kernel.org/r/20250514184158.3471331-5-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 92861ff3c43f..f7848f73f41c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -942,19 +942,8 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, local_irq_restore(flags); } -void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, - unsigned long count); - -static inline void count_memcg_events(struct mem_cgroup *memcg, - enum vm_event_item idx, - unsigned long count) -{ - unsigned long flags; - - local_irq_save(flags); - __count_memcg_events(memcg, idx, count); - local_irq_restore(flags); -} +void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, + unsigned long count); static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) @@ -1418,12 +1407,6 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, } static inline void count_memcg_events(struct mem_cgroup *memcg, - enum vm_event_item idx, - unsigned long count) -{ -} - -static inline void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { -- cgit v1.2.3 From ba1a455393c430c97235cb593d40a3ea7ad8acca Mon Sep 17 00:00:00 2001 From: Artur Weber Date: Sun, 16 Mar 2025 19:18:52 +0100 Subject: mfd: bcm590xx: Drop unused "id" member of bcm590xx struct The "id" member of the bcm590xx struct is unused and will be confusing once we add an actual PMU ID storage value. Drop it; a replacement will be introduced in a future commit. Signed-off-by: Artur Weber Reviewed-by: "Rob Herring (Arm)" Link: https://lore.kernel.org/r/20250316-bcm59054-v7-4-4281126be1b8@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/bcm590xx.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/bcm590xx.h b/include/linux/mfd/bcm590xx.h index 09d4ae054242..5a5783abd47b 100644 --- a/include/linux/mfd/bcm590xx.h +++ b/include/linux/mfd/bcm590xx.h @@ -43,7 +43,6 @@ struct bcm590xx { struct i2c_client *i2c_sec; struct regmap *regmap_pri; struct regmap *regmap_sec; - unsigned int id; /* PMU ID value; also used as device type */ u8 pmu_id; -- cgit v1.2.3 From b3379422b460ea8c0556df300d547d63e5569cda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Wed, 9 Apr 2025 21:37:25 +0100 Subject: mfd: sec-core: Drop non-existing forward declarations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sec_irq_resume() was removed in commit 6445b84abf91 ("mfd: Add s2mps11 irq driver") and sec_irq_exit() in commit 3dc6f4aaafbe ("mfd: sec: Use devm_mfd_add_devices and devm_regmap_add_irq_chip") while the prototypes were left. They should be removed. Do so. Reviewed-by: Krzysztof Kozlowski Signed-off-by: André Draszik Link: https://lore.kernel.org/r/20250409-s2mpg10-v4-4-d66d5f39b6bf@linaro.org Signed-off-by: Lee Jones --- include/linux/mfd/samsung/core.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/samsung/core.h b/include/linux/mfd/samsung/core.h index f35314458fd2..b7008b50392a 100644 --- a/include/linux/mfd/samsung/core.h +++ b/include/linux/mfd/samsung/core.h @@ -72,8 +72,6 @@ struct sec_pmic_dev { }; int sec_irq_init(struct sec_pmic_dev *sec_pmic); -void sec_irq_exit(struct sec_pmic_dev *sec_pmic); -int sec_irq_resume(struct sec_pmic_dev *sec_pmic); struct sec_platform_data { struct sec_regulator_data *regulators; -- cgit v1.2.3 From 8b88b5e4d58151d8a37250afc978f253cd8cc4fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Wed, 9 Apr 2025 21:37:28 +0100 Subject: mfd: sec: Move private internal API to internal header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sec_irq_init() is an internal API for the core driver, and doesn't belong into the public header. Due to an upcoming split of the driver into a core and i2c driver, we'll also be adding more internal APIs, which again shouldn't be in the public header. Move it into a new internal include. Reviewed-by: Krzysztof Kozlowski Signed-off-by: André Draszik Link: https://lore.kernel.org/r/20250409-s2mpg10-v4-7-d66d5f39b6bf@linaro.org Signed-off-by: Lee Jones --- include/linux/mfd/samsung/core.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/samsung/core.h b/include/linux/mfd/samsung/core.h index b7008b50392a..8a4e660854bb 100644 --- a/include/linux/mfd/samsung/core.h +++ b/include/linux/mfd/samsung/core.h @@ -71,8 +71,6 @@ struct sec_pmic_dev { struct regmap_irq_chip_data *irq_data; }; -int sec_irq_init(struct sec_pmic_dev *sec_pmic); - struct sec_platform_data { struct sec_regulator_data *regulators; struct sec_opmode_data *opmode; -- cgit v1.2.3 From 5338709089b75f57a9b3b7b17a0424ecab5f2fd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Wed, 9 Apr 2025 21:37:30 +0100 Subject: mfd: sec: Add support for S2MPG10 PMIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for Samsung's S2MPG10 PMIC, which is a Power Management IC for mobile applications with buck converters, various LDOs, power meters, RTC, clock outputs, and additional GPIOs interfaces. Contrary to existing Samsung S2M series PMICs supported, communication is not via I2C, but via the Samsung ACPM firmware. This commit adds the core driver. Signed-off-by: André Draszik Link: https://lore.kernel.org/r/20250409-s2mpg10-v4-9-d66d5f39b6bf@linaro.org Signed-off-by: Lee Jones --- include/linux/mfd/samsung/core.h | 1 + include/linux/mfd/samsung/irq.h | 103 ++++++++ include/linux/mfd/samsung/rtc.h | 37 +++ include/linux/mfd/samsung/s2mpg10.h | 454 ++++++++++++++++++++++++++++++++++++ 4 files changed, 595 insertions(+) create mode 100644 include/linux/mfd/samsung/s2mpg10.h (limited to 'include/linux') diff --git a/include/linux/mfd/samsung/core.h b/include/linux/mfd/samsung/core.h index 8a4e660854bb..c1102324172a 100644 --- a/include/linux/mfd/samsung/core.h +++ b/include/linux/mfd/samsung/core.h @@ -39,6 +39,7 @@ enum sec_device_type { S5M8767X, S2DOS05, S2MPA01, + S2MPG10, S2MPS11X, S2MPS13X, S2MPS14X, diff --git a/include/linux/mfd/samsung/irq.h b/include/linux/mfd/samsung/irq.h index 978f7af66f74..b4805cbd949b 100644 --- a/include/linux/mfd/samsung/irq.h +++ b/include/linux/mfd/samsung/irq.h @@ -57,6 +57,109 @@ enum s2mpa01_irq { #define S2MPA01_IRQ_B24_TSD_MASK (1 << 4) #define S2MPA01_IRQ_B35_TSD_MASK (1 << 5) +enum s2mpg10_irq { + /* PMIC */ + S2MPG10_IRQ_PWRONF, + S2MPG10_IRQ_PWRONR, + S2MPG10_IRQ_JIGONBF, + S2MPG10_IRQ_JIGONBR, + S2MPG10_IRQ_ACOKBF, + S2MPG10_IRQ_ACOKBR, + S2MPG10_IRQ_PWRON1S, + S2MPG10_IRQ_MRB, +#define S2MPG10_IRQ_PWRONF_MASK BIT(0) +#define S2MPG10_IRQ_PWRONR_MASK BIT(1) +#define S2MPG10_IRQ_JIGONBF_MASK BIT(2) +#define S2MPG10_IRQ_JIGONBR_MASK BIT(3) +#define S2MPG10_IRQ_ACOKBF_MASK BIT(4) +#define S2MPG10_IRQ_ACOKBR_MASK BIT(5) +#define S2MPG10_IRQ_PWRON1S_MASK BIT(6) +#define S2MPG10_IRQ_MRB_MASK BIT(7) + + S2MPG10_IRQ_RTC60S, + S2MPG10_IRQ_RTCA1, + S2MPG10_IRQ_RTCA0, + S2MPG10_IRQ_RTC1S, + S2MPG10_IRQ_WTSR_COLDRST, + S2MPG10_IRQ_WTSR, + S2MPG10_IRQ_WRST, + S2MPG10_IRQ_SMPL, +#define S2MPG10_IRQ_RTC60S_MASK BIT(0) +#define S2MPG10_IRQ_RTCA1_MASK BIT(1) +#define S2MPG10_IRQ_RTCA0_MASK BIT(2) +#define S2MPG10_IRQ_RTC1S_MASK BIT(3) +#define S2MPG10_IRQ_WTSR_COLDRST_MASK BIT(4) +#define S2MPG10_IRQ_WTSR_MASK BIT(5) +#define S2MPG10_IRQ_WRST_MASK BIT(6) +#define S2MPG10_IRQ_SMPL_MASK BIT(7) + + S2MPG10_IRQ_120C, + S2MPG10_IRQ_140C, + S2MPG10_IRQ_TSD, + S2MPG10_IRQ_PIF_TIMEOUT1, + S2MPG10_IRQ_PIF_TIMEOUT2, + S2MPG10_IRQ_SPD_PARITY_ERR, + S2MPG10_IRQ_SPD_ABNORMAL_STOP, + S2MPG10_IRQ_PMETER_OVERF, +#define S2MPG10_IRQ_INT120C_MASK BIT(0) +#define S2MPG10_IRQ_INT140C_MASK BIT(1) +#define S2MPG10_IRQ_TSD_MASK BIT(2) +#define S2MPG10_IRQ_PIF_TIMEOUT1_MASK BIT(3) +#define S2MPG10_IRQ_PIF_TIMEOUT2_MASK BIT(4) +#define S2MPG10_IRQ_SPD_PARITY_ERR_MASK BIT(5) +#define S2MPG10_IRQ_SPD_ABNORMAL_STOP_MASK BIT(6) +#define S2MPG10_IRQ_PMETER_OVERF_MASK BIT(7) + + S2MPG10_IRQ_OCP_B1M, + S2MPG10_IRQ_OCP_B2M, + S2MPG10_IRQ_OCP_B3M, + S2MPG10_IRQ_OCP_B4M, + S2MPG10_IRQ_OCP_B5M, + S2MPG10_IRQ_OCP_B6M, + S2MPG10_IRQ_OCP_B7M, + S2MPG10_IRQ_OCP_B8M, +#define S2MPG10_IRQ_OCP_B1M_MASK BIT(0) +#define S2MPG10_IRQ_OCP_B2M_MASK BIT(1) +#define S2MPG10_IRQ_OCP_B3M_MASK BIT(2) +#define S2MPG10_IRQ_OCP_B4M_MASK BIT(3) +#define S2MPG10_IRQ_OCP_B5M_MASK BIT(4) +#define S2MPG10_IRQ_OCP_B6M_MASK BIT(5) +#define S2MPG10_IRQ_OCP_B7M_MASK BIT(6) +#define S2MPG10_IRQ_OCP_B8M_MASK BIT(7) + + S2MPG10_IRQ_OCP_B9M, + S2MPG10_IRQ_OCP_B10M, + S2MPG10_IRQ_WLWP_ACC, + S2MPG10_IRQ_SMPL_TIMEOUT, + S2MPG10_IRQ_WTSR_TIMEOUT, + S2MPG10_IRQ_SPD_SRP_PKT_RST, +#define S2MPG10_IRQ_OCP_B9M_MASK BIT(0) +#define S2MPG10_IRQ_OCP_B10M_MASK BIT(1) +#define S2MPG10_IRQ_WLWP_ACC_MASK BIT(2) +#define S2MPG10_IRQ_SMPL_TIMEOUT_MASK BIT(5) +#define S2MPG10_IRQ_WTSR_TIMEOUT_MASK BIT(6) +#define S2MPG10_IRQ_SPD_SRP_PKT_RST_MASK BIT(7) + + S2MPG10_IRQ_PWR_WARN_CH0, + S2MPG10_IRQ_PWR_WARN_CH1, + S2MPG10_IRQ_PWR_WARN_CH2, + S2MPG10_IRQ_PWR_WARN_CH3, + S2MPG10_IRQ_PWR_WARN_CH4, + S2MPG10_IRQ_PWR_WARN_CH5, + S2MPG10_IRQ_PWR_WARN_CH6, + S2MPG10_IRQ_PWR_WARN_CH7, +#define S2MPG10_IRQ_PWR_WARN_CH0_MASK BIT(0) +#define S2MPG10_IRQ_PWR_WARN_CH1_MASK BIT(1) +#define S2MPG10_IRQ_PWR_WARN_CH2_MASK BIT(2) +#define S2MPG10_IRQ_PWR_WARN_CH3_MASK BIT(3) +#define S2MPG10_IRQ_PWR_WARN_CH4_MASK BIT(4) +#define S2MPG10_IRQ_PWR_WARN_CH5_MASK BIT(5) +#define S2MPG10_IRQ_PWR_WARN_CH6_MASK BIT(6) +#define S2MPG10_IRQ_PWR_WARN_CH7_MASK BIT(7) + + S2MPG10_IRQ_NR, +}; + enum s2mps11_irq { S2MPS11_IRQ_PWRONF, S2MPS11_IRQ_PWRONR, diff --git a/include/linux/mfd/samsung/rtc.h b/include/linux/mfd/samsung/rtc.h index 0204decfc9aa..51c4239a1fa6 100644 --- a/include/linux/mfd/samsung/rtc.h +++ b/include/linux/mfd/samsung/rtc.h @@ -72,6 +72,37 @@ enum s2mps_rtc_reg { S2MPS_RTC_REG_MAX, }; +enum s2mpg10_rtc_reg { + S2MPG10_RTC_CTRL, + S2MPG10_RTC_UPDATE, + S2MPG10_RTC_SMPL, + S2MPG10_RTC_WTSR, + S2MPG10_RTC_CAP_SEL, + S2MPG10_RTC_MSEC, + S2MPG10_RTC_SEC, + S2MPG10_RTC_MIN, + S2MPG10_RTC_HOUR, + S2MPG10_RTC_WEEK, + S2MPG10_RTC_DAY, + S2MPG10_RTC_MON, + S2MPG10_RTC_YEAR, + S2MPG10_RTC_A0SEC, + S2MPG10_RTC_A0MIN, + S2MPG10_RTC_A0HOUR, + S2MPG10_RTC_A0WEEK, + S2MPG10_RTC_A0DAY, + S2MPG10_RTC_A0MON, + S2MPG10_RTC_A0YEAR, + S2MPG10_RTC_A1SEC, + S2MPG10_RTC_A1MIN, + S2MPG10_RTC_A1HOUR, + S2MPG10_RTC_A1WEEK, + S2MPG10_RTC_A1DAY, + S2MPG10_RTC_A1MON, + S2MPG10_RTC_A1YEAR, + S2MPG10_RTC_OSC_CTRL, +}; + #define RTC_I2C_ADDR (0x0C >> 1) #define HOUR_12 (1 << 7) @@ -124,10 +155,16 @@ enum s2mps_rtc_reg { #define ALARM_ENABLE_SHIFT 7 #define ALARM_ENABLE_MASK (1 << ALARM_ENABLE_SHIFT) +/* WTSR & SMPL registers */ #define SMPL_ENABLE_SHIFT 7 #define SMPL_ENABLE_MASK (1 << SMPL_ENABLE_SHIFT) #define WTSR_ENABLE_SHIFT 6 #define WTSR_ENABLE_MASK (1 << WTSR_ENABLE_SHIFT) +#define S2MPG10_WTSR_COLDTIMER GENMASK(6, 5) +#define S2MPG10_WTSR_COLDRST BIT(4) +#define S2MPG10_WTSR_WTSRT GENMASK(3, 1) +#define S2MPG10_WTSR_WTSR_EN BIT(0) + #endif /* __LINUX_MFD_SEC_RTC_H */ diff --git a/include/linux/mfd/samsung/s2mpg10.h b/include/linux/mfd/samsung/s2mpg10.h new file mode 100644 index 000000000000..9f5919b89a3c --- /dev/null +++ b/include/linux/mfd/samsung/s2mpg10.h @@ -0,0 +1,454 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright 2015 Samsung Electronics + * Copyright 2020 Google Inc + * Copyright 2025 Linaro Ltd. + */ + +#ifndef __LINUX_MFD_S2MPG10_H +#define __LINUX_MFD_S2MPG10_H + +/* Common registers (type 0x000) */ +enum s2mpg10_common_reg { + S2MPG10_COMMON_CHIPID, + S2MPG10_COMMON_INT, + S2MPG10_COMMON_INT_MASK, + S2MPG10_COMMON_SPD_CTRL1 = 0x0a, + S2MPG10_COMMON_SPD_CTRL2, + S2MPG10_COMMON_SPD_CTRL3, + S2MPG10_COMMON_MON1SEL = 0x1a, + S2MPG10_COMMON_MON2SEL, + S2MPG10_COMMON_MONR, + S2MPG10_COMMON_DEBUG_CTRL1, + S2MPG10_COMMON_DEBUG_CTRL2, + S2MPG10_COMMON_DEBUG_CTRL3, + S2MPG10_COMMON_DEBUG_CTRL4, + S2MPG10_COMMON_DEBUG_CTRL5, + S2MPG10_COMMON_DEBUG_CTRL6, + S2MPG10_COMMON_DEBUG_CTRL7, + S2MPG10_COMMON_DEBUG_CTRL8, + S2MPG10_COMMON_TEST_MODE1, + S2MPG10_COMMON_TEST_MODE2, + S2MPG10_COMMON_SPD_DEBUG1, + S2MPG10_COMMON_SPD_DEBUG2, + S2MPG10_COMMON_SPD_DEBUG3, + S2MPG10_COMMON_SPD_DEBUG4, +}; + +/* For S2MPG10_COMMON_INT and S2MPG10_COMMON_INT_MASK */ +#define S2MPG10_COMMON_INT_SRC GENMASK(7, 0) +#define S2MPG10_COMMON_INT_SRC_PMIC BIT(0) + +/* PMIC registers (type 0x100) */ +enum s2mpg10_pmic_reg { + S2MPG10_PMIC_INT1, + S2MPG10_PMIC_INT2, + S2MPG10_PMIC_INT3, + S2MPG10_PMIC_INT4, + S2MPG10_PMIC_INT5, + S2MPG10_PMIC_INT6, + S2MPG10_PMIC_INT1M, + S2MPG10_PMIC_INT2M, + S2MPG10_PMIC_INT3M, + S2MPG10_PMIC_INT4M, + S2MPG10_PMIC_INT5M, + S2MPG10_PMIC_INT6M, + S2MPG10_PMIC_STATUS1, + S2MPG10_PMIC_STATUS2, + S2MPG10_PMIC_PWRONSRC, + S2MPG10_PMIC_OFFSRC, + S2MPG10_PMIC_BU_CHG, + S2MPG10_PMIC_RTCBUF, + S2MPG10_PMIC_COMMON_CTRL1, + S2MPG10_PMIC_COMMON_CTRL2, + S2MPG10_PMIC_COMMON_CTRL3, + S2MPG10_PMIC_COMMON_CTRL4, + S2MPG10_PMIC_SMPL_WARN_CTRL, + S2MPG10_PMIC_MIMICKING_CTRL, + S2MPG10_PMIC_B1M_CTRL, + S2MPG10_PMIC_B1M_OUT1, + S2MPG10_PMIC_B1M_OUT2, + S2MPG10_PMIC_B2M_CTRL, + S2MPG10_PMIC_B2M_OUT1, + S2MPG10_PMIC_B2M_OUT2, + S2MPG10_PMIC_B3M_CTRL, + S2MPG10_PMIC_B3M_OUT1, + S2MPG10_PMIC_B3M_OUT2, + S2MPG10_PMIC_B4M_CTRL, + S2MPG10_PMIC_B4M_OUT1, + S2MPG10_PMIC_B4M_OUT2, + S2MPG10_PMIC_B5M_CTRL, + S2MPG10_PMIC_B5M_OUT1, + S2MPG10_PMIC_B5M_OUT2, + S2MPG10_PMIC_B6M_CTRL, + S2MPG10_PMIC_B6M_OUT1, + S2MPG10_PMIC_B6M_OUT2, + S2MPG10_PMIC_B7M_CTRL, + S2MPG10_PMIC_B7M_OUT1, + S2MPG10_PMIC_B7M_OUT2, + S2MPG10_PMIC_B8M_CTRL, + S2MPG10_PMIC_B8M_OUT1, + S2MPG10_PMIC_B8M_OUT2, + S2MPG10_PMIC_B9M_CTRL, + S2MPG10_PMIC_B9M_OUT1, + S2MPG10_PMIC_B9M_OUT2, + S2MPG10_PMIC_B10M_CTRL, + S2MPG10_PMIC_B10M_OUT1, + S2MPG10_PMIC_B10M_OUT2, + S2MPG10_PMIC_BUCK1M_USONIC, + S2MPG10_PMIC_BUCK2M_USONIC, + S2MPG10_PMIC_BUCK3M_USONIC, + S2MPG10_PMIC_BUCK4M_USONIC, + S2MPG10_PMIC_BUCK5M_USONIC, + S2MPG10_PMIC_BUCK6M_USONIC, + S2MPG10_PMIC_BUCK7M_USONIC, + S2MPG10_PMIC_BUCK8M_USONIC, + S2MPG10_PMIC_BUCK9M_USONIC, + S2MPG10_PMIC_BUCK10M_USONIC, + S2MPG10_PMIC_L1M_CTRL, + S2MPG10_PMIC_L2M_CTRL, + S2MPG10_PMIC_L3M_CTRL, + S2MPG10_PMIC_L4M_CTRL, + S2MPG10_PMIC_L5M_CTRL, + S2MPG10_PMIC_L6M_CTRL, + S2MPG10_PMIC_L7M_CTRL, + S2MPG10_PMIC_L8M_CTRL, + S2MPG10_PMIC_L9M_CTRL, + S2MPG10_PMIC_L10M_CTRL, + S2MPG10_PMIC_L11M_CTRL1, + S2MPG10_PMIC_L11M_CTRL2, + S2MPG10_PMIC_L12M_CTRL1, + S2MPG10_PMIC_L12M_CTRL2, + S2MPG10_PMIC_L13M_CTRL1, + S2MPG10_PMIC_L13M_CTRL2, + S2MPG10_PMIC_L14M_CTRL, + S2MPG10_PMIC_L15M_CTRL1, + S2MPG10_PMIC_L15M_CTRL2, + S2MPG10_PMIC_L16M_CTRL, + S2MPG10_PMIC_L17M_CTRL, + S2MPG10_PMIC_L18M_CTRL, + S2MPG10_PMIC_L19M_CTRL, + S2MPG10_PMIC_L20M_CTRL, + S2MPG10_PMIC_L21M_CTRL, + S2MPG10_PMIC_L22M_CTRL, + S2MPG10_PMIC_L23M_CTRL, + S2MPG10_PMIC_L24M_CTRL, + S2MPG10_PMIC_L25M_CTRL, + S2MPG10_PMIC_L26M_CTRL, + S2MPG10_PMIC_L27M_CTRL, + S2MPG10_PMIC_L28M_CTRL, + S2MPG10_PMIC_L29M_CTRL, + S2MPG10_PMIC_L30M_CTRL, + S2MPG10_PMIC_L31M_CTRL, + S2MPG10_PMIC_LDO_CTRL1, + S2MPG10_PMIC_LDO_CTRL2, + S2MPG10_PMIC_LDO_DSCH1, + S2MPG10_PMIC_LDO_DSCH2, + S2MPG10_PMIC_LDO_DSCH3, + S2MPG10_PMIC_LDO_DSCH4, + S2MPG10_PMIC_LDO_BUCK7M_HLIMIT, + S2MPG10_PMIC_LDO_BUCK7M_LLIMIT, + S2MPG10_PMIC_LDO_LDO21M_HLIMIT, + S2MPG10_PMIC_LDO_LDO21M_LLIMIT, + S2MPG10_PMIC_LDO_LDO11M_HLIMIT, + S2MPG10_PMIC_DVS_RAMP1, + S2MPG10_PMIC_DVS_RAMP2, + S2MPG10_PMIC_DVS_RAMP3, + S2MPG10_PMIC_DVS_RAMP4, + S2MPG10_PMIC_DVS_RAMP5, + S2MPG10_PMIC_DVS_RAMP6, + S2MPG10_PMIC_DVS_SYNC_CTRL1, + S2MPG10_PMIC_DVS_SYNC_CTRL2, + S2MPG10_PMIC_DVS_SYNC_CTRL3, + S2MPG10_PMIC_DVS_SYNC_CTRL4, + S2MPG10_PMIC_DVS_SYNC_CTRL5, + S2MPG10_PMIC_DVS_SYNC_CTRL6, + S2MPG10_PMIC_OFF_CTRL1, + S2MPG10_PMIC_OFF_CTRL2, + S2MPG10_PMIC_OFF_CTRL3, + S2MPG10_PMIC_OFF_CTRL4, + S2MPG10_PMIC_SEQ_CTRL1, + S2MPG10_PMIC_SEQ_CTRL2, + S2MPG10_PMIC_SEQ_CTRL3, + S2MPG10_PMIC_SEQ_CTRL4, + S2MPG10_PMIC_SEQ_CTRL5, + S2MPG10_PMIC_SEQ_CTRL6, + S2MPG10_PMIC_SEQ_CTRL7, + S2MPG10_PMIC_SEQ_CTRL8, + S2MPG10_PMIC_SEQ_CTRL9, + S2MPG10_PMIC_SEQ_CTRL10, + S2MPG10_PMIC_SEQ_CTRL11, + S2MPG10_PMIC_SEQ_CTRL12, + S2MPG10_PMIC_SEQ_CTRL13, + S2MPG10_PMIC_SEQ_CTRL14, + S2MPG10_PMIC_SEQ_CTRL15, + S2MPG10_PMIC_SEQ_CTRL16, + S2MPG10_PMIC_SEQ_CTRL17, + S2MPG10_PMIC_SEQ_CTRL18, + S2MPG10_PMIC_SEQ_CTRL19, + S2MPG10_PMIC_SEQ_CTRL20, + S2MPG10_PMIC_SEQ_CTRL21, + S2MPG10_PMIC_SEQ_CTRL22, + S2MPG10_PMIC_SEQ_CTRL23, + S2MPG10_PMIC_SEQ_CTRL24, + S2MPG10_PMIC_SEQ_CTRL25, + S2MPG10_PMIC_SEQ_CTRL26, + S2MPG10_PMIC_SEQ_CTRL27, + S2MPG10_PMIC_SEQ_CTRL28, + S2MPG10_PMIC_SEQ_CTRL29, + S2MPG10_PMIC_SEQ_CTRL30, + S2MPG10_PMIC_SEQ_CTRL31, + S2MPG10_PMIC_SEQ_CTRL32, + S2MPG10_PMIC_SEQ_CTRL33, + S2MPG10_PMIC_SEQ_CTRL34, + S2MPG10_PMIC_SEQ_CTRL35, + S2MPG10_PMIC_OFF_SEQ_CTRL1, + S2MPG10_PMIC_OFF_SEQ_CTRL2, + S2MPG10_PMIC_OFF_SEQ_CTRL3, + S2MPG10_PMIC_OFF_SEQ_CTRL4, + S2MPG10_PMIC_OFF_SEQ_CTRL5, + S2MPG10_PMIC_OFF_SEQ_CTRL6, + S2MPG10_PMIC_OFF_SEQ_CTRL7, + S2MPG10_PMIC_OFF_SEQ_CTRL8, + S2MPG10_PMIC_OFF_SEQ_CTRL9, + S2MPG10_PMIC_OFF_SEQ_CTRL10, + S2MPG10_PMIC_OFF_SEQ_CTRL11, + S2MPG10_PMIC_OFF_SEQ_CTRL12, + S2MPG10_PMIC_OFF_SEQ_CTRL13, + S2MPG10_PMIC_OFF_SEQ_CTRL14, + S2MPG10_PMIC_OFF_SEQ_CTRL15, + S2MPG10_PMIC_OFF_SEQ_CTRL16, + S2MPG10_PMIC_OFF_SEQ_CTRL17, + S2MPG10_PMIC_OFF_SEQ_CTRL18, + S2MPG10_PMIC_PCTRLSEL1, + S2MPG10_PMIC_PCTRLSEL2, + S2MPG10_PMIC_PCTRLSEL3, + S2MPG10_PMIC_PCTRLSEL4, + S2MPG10_PMIC_PCTRLSEL5, + S2MPG10_PMIC_PCTRLSEL6, + S2MPG10_PMIC_PCTRLSEL7, + S2MPG10_PMIC_PCTRLSEL8, + S2MPG10_PMIC_PCTRLSEL9, + S2MPG10_PMIC_PCTRLSEL10, + S2MPG10_PMIC_PCTRLSEL11, + S2MPG10_PMIC_PCTRLSEL12, + S2MPG10_PMIC_PCTRLSEL13, + S2MPG10_PMIC_DCTRLSEL1, + S2MPG10_PMIC_DCTRLSEL2, + S2MPG10_PMIC_DCTRLSEL3, + S2MPG10_PMIC_DCTRLSEL4, + S2MPG10_PMIC_DCTRLSEL5, + S2MPG10_PMIC_DCTRLSEL6, + S2MPG10_PMIC_DCTRLSEL7, + S2MPG10_PMIC_GPIO_CTRL1, + S2MPG10_PMIC_GPIO_CTRL2, + S2MPG10_PMIC_GPIO_CTRL3, + S2MPG10_PMIC_GPIO_CTRL4, + S2MPG10_PMIC_GPIO_CTRL5, + S2MPG10_PMIC_GPIO_CTRL6, + S2MPG10_PMIC_GPIO_CTRL7, + S2MPG10_PMIC_B2M_OCP_WARN, + S2MPG10_PMIC_B2M_OCP_WARN_X, + S2MPG10_PMIC_B2M_OCP_WARN_Y, + S2MPG10_PMIC_B2M_OCP_WARN_Z, + S2MPG10_PMIC_B3M_OCP_WARN, + S2MPG10_PMIC_B3M_OCP_WARN_X, + S2MPG10_PMIC_B3M_OCP_WARN_Y, + S2MPG10_PMIC_B3M_OCP_WARN_Z, + S2MPG10_PMIC_B10M_OCP_WARN, + S2MPG10_PMIC_B10M_OCP_WARN_X, + S2MPG10_PMIC_B10M_OCP_WARN_Y, + S2MPG10_PMIC_B10M_OCP_WARN_Z, + S2MPG10_PMIC_B2M_SOFT_OCP_WARN, + S2MPG10_PMIC_B2M_SOFT_OCP_WARN_X, + S2MPG10_PMIC_B2M_SOFT_OCP_WARN_Y, + S2MPG10_PMIC_B2M_SOFT_OCP_WARN_Z, + S2MPG10_PMIC_B3M_SOFT_OCP_WARN, + S2MPG10_PMIC_B3M_SOFT_OCP_WARN_X, + S2MPG10_PMIC_B3M_SOFT_OCP_WARN_Y, + S2MPG10_PMIC_B3M_SOFT_OCP_WARN_Z, + S2MPG10_PMIC_B10M_SOFT_OCP_WARN, + S2MPG10_PMIC_B10M_SOFT_OCP_WARN_X, + S2MPG10_PMIC_B10M_SOFT_OCP_WARN_Y, + S2MPG10_PMIC_B10M_SOFT_OCP_WARN_Z, + S2MPG10_PMIC_BUCK_OCP_EN1, + S2MPG10_PMIC_BUCK_OCP_EN2, + S2MPG10_PMIC_BUCK_OCP_PD_EN1, + S2MPG10_PMIC_BUCK_OCP_PD_EN2, + S2MPG10_PMIC_BUCK_OCP_CTRL1, + S2MPG10_PMIC_BUCK_OCP_CTRL2, + S2MPG10_PMIC_BUCK_OCP_CTRL3, + S2MPG10_PMIC_BUCK_OCP_CTRL4, + S2MPG10_PMIC_BUCK_OCP_CTRL5, + S2MPG10_PMIC_PIF_CTRL, + S2MPG10_PMIC_BUCK_HR_MODE1, + S2MPG10_PMIC_BUCK_HR_MODE2, + S2MPG10_PMIC_FAULTOUT_CTRL, + S2MPG10_PMIC_LDO_SENSE1, + S2MPG10_PMIC_LDO_SENSE2, + S2MPG10_PMIC_LDO_SENSE3, + S2MPG10_PMIC_LDO_SENSE4, +}; + +/* Meter registers (type 0xa00) */ +enum s2mpg10_meter_reg { + S2MPG10_METER_CTRL1, + S2MPG10_METER_CTRL2, + S2MPG10_METER_CTRL3, + S2MPG10_METER_CTRL4, + S2MPG10_METER_BUCKEN1, + S2MPG10_METER_BUCKEN2, + S2MPG10_METER_MUXSEL0, + S2MPG10_METER_MUXSEL1, + S2MPG10_METER_MUXSEL2, + S2MPG10_METER_MUXSEL3, + S2MPG10_METER_MUXSEL4, + S2MPG10_METER_MUXSEL5, + S2MPG10_METER_MUXSEL6, + S2MPG10_METER_MUXSEL7, + S2MPG10_METER_LPF_C0_0, + S2MPG10_METER_LPF_C0_1, + S2MPG10_METER_LPF_C0_2, + S2MPG10_METER_LPF_C0_3, + S2MPG10_METER_LPF_C0_4, + S2MPG10_METER_LPF_C0_5, + S2MPG10_METER_LPF_C0_6, + S2MPG10_METER_LPF_C0_7, + S2MPG10_METER_PWR_WARN0, + S2MPG10_METER_PWR_WARN1, + S2MPG10_METER_PWR_WARN2, + S2MPG10_METER_PWR_WARN3, + S2MPG10_METER_PWR_WARN4, + S2MPG10_METER_PWR_WARN5, + S2MPG10_METER_PWR_WARN6, + S2MPG10_METER_PWR_WARN7, + S2MPG10_METER_PWR_HYS1, + S2MPG10_METER_PWR_HYS2, + S2MPG10_METER_PWR_HYS3, + S2MPG10_METER_PWR_HYS4, + S2MPG10_METER_ACC_DATA_CH0_1 = 0x40, + S2MPG10_METER_ACC_DATA_CH0_2, + S2MPG10_METER_ACC_DATA_CH0_3, + S2MPG10_METER_ACC_DATA_CH0_4, + S2MPG10_METER_ACC_DATA_CH0_5, + S2MPG10_METER_ACC_DATA_CH0_6, + S2MPG10_METER_ACC_DATA_CH1_1, + S2MPG10_METER_ACC_DATA_CH1_2, + S2MPG10_METER_ACC_DATA_CH1_3, + S2MPG10_METER_ACC_DATA_CH1_4, + S2MPG10_METER_ACC_DATA_CH1_5, + S2MPG10_METER_ACC_DATA_CH1_6, + S2MPG10_METER_ACC_DATA_CH2_1, + S2MPG10_METER_ACC_DATA_CH2_2, + S2MPG10_METER_ACC_DATA_CH2_3, + S2MPG10_METER_ACC_DATA_CH2_4, + S2MPG10_METER_ACC_DATA_CH2_5, + S2MPG10_METER_ACC_DATA_CH2_6, + S2MPG10_METER_ACC_DATA_CH3_1, + S2MPG10_METER_ACC_DATA_CH3_2, + S2MPG10_METER_ACC_DATA_CH3_3, + S2MPG10_METER_ACC_DATA_CH3_4, + S2MPG10_METER_ACC_DATA_CH3_5, + S2MPG10_METER_ACC_DATA_CH3_6, + S2MPG10_METER_ACC_DATA_CH4_1, + S2MPG10_METER_ACC_DATA_CH4_2, + S2MPG10_METER_ACC_DATA_CH4_3, + S2MPG10_METER_ACC_DATA_CH4_4, + S2MPG10_METER_ACC_DATA_CH4_5, + S2MPG10_METER_ACC_DATA_CH4_6, + S2MPG10_METER_ACC_DATA_CH5_1, + S2MPG10_METER_ACC_DATA_CH5_2, + S2MPG10_METER_ACC_DATA_CH5_3, + S2MPG10_METER_ACC_DATA_CH5_4, + S2MPG10_METER_ACC_DATA_CH5_5, + S2MPG10_METER_ACC_DATA_CH5_6, + S2MPG10_METER_ACC_DATA_CH6_1, + S2MPG10_METER_ACC_DATA_CH6_2, + S2MPG10_METER_ACC_DATA_CH6_3, + S2MPG10_METER_ACC_DATA_CH6_4, + S2MPG10_METER_ACC_DATA_CH6_5, + S2MPG10_METER_ACC_DATA_CH6_6, + S2MPG10_METER_ACC_DATA_CH7_1, + S2MPG10_METER_ACC_DATA_CH7_2, + S2MPG10_METER_ACC_DATA_CH7_3, + S2MPG10_METER_ACC_DATA_CH7_4, + S2MPG10_METER_ACC_DATA_CH7_5, + S2MPG10_METER_ACC_DATA_CH7_6, + S2MPG10_METER_ACC_COUNT_1, + S2MPG10_METER_ACC_COUNT_2, + S2MPG10_METER_ACC_COUNT_3, + S2MPG10_METER_LPF_DATA_CH0_1, + S2MPG10_METER_LPF_DATA_CH0_2, + S2MPG10_METER_LPF_DATA_CH0_3, + S2MPG10_METER_LPF_DATA_CH1_1, + S2MPG10_METER_LPF_DATA_CH1_2, + S2MPG10_METER_LPF_DATA_CH1_3, + S2MPG10_METER_LPF_DATA_CH2_1, + S2MPG10_METER_LPF_DATA_CH2_2, + S2MPG10_METER_LPF_DATA_CH2_3, + S2MPG10_METER_LPF_DATA_CH3_1, + S2MPG10_METER_LPF_DATA_CH3_2, + S2MPG10_METER_LPF_DATA_CH3_3, + S2MPG10_METER_LPF_DATA_CH4_1, + S2MPG10_METER_LPF_DATA_CH4_2, + S2MPG10_METER_LPF_DATA_CH4_3, + S2MPG10_METER_LPF_DATA_CH5_1, + S2MPG10_METER_LPF_DATA_CH5_2, + S2MPG10_METER_LPF_DATA_CH5_3, + S2MPG10_METER_LPF_DATA_CH6_1, + S2MPG10_METER_LPF_DATA_CH6_2, + S2MPG10_METER_LPF_DATA_CH6_3, + S2MPG10_METER_LPF_DATA_CH7_1, + S2MPG10_METER_LPF_DATA_CH7_2, + S2MPG10_METER_LPF_DATA_CH7_3, + S2MPG10_METER_DSM_TRIM_OFFSET = 0xee, + S2MPG10_METER_BUCK_METER_TRIM3 = 0xf1, +}; + +/* S2MPG10 regulator IDs */ +enum s2mpg10_regulators { + S2MPG10_LDO1, + S2MPG10_LDO2, + S2MPG10_LDO3, + S2MPG10_LDO4, + S2MPG10_LDO5, + S2MPG10_LDO6, + S2MPG10_LDO7, + S2MPG10_LDO8, + S2MPG10_LDO9, + S2MPG10_LDO10, + S2MPG10_LDO11, + S2MPG10_LDO12, + S2MPG10_LDO13, + S2MPG10_LDO14, + S2MPG10_LDO15, + S2MPG10_LDO16, + S2MPG10_LDO17, + S2MPG10_LDO18, + S2MPG10_LDO19, + S2MPG10_LDO20, + S2MPG10_LDO21, + S2MPG10_LDO22, + S2MPG10_LDO23, + S2MPG10_LDO24, + S2MPG10_LDO25, + S2MPG10_LDO26, + S2MPG10_LDO27, + S2MPG10_LDO28, + S2MPG10_LDO29, + S2MPG10_LDO30, + S2MPG10_LDO31, + S2MPG10_BUCK1, + S2MPG10_BUCK2, + S2MPG10_BUCK3, + S2MPG10_BUCK4, + S2MPG10_BUCK5, + S2MPG10_BUCK6, + S2MPG10_BUCK7, + S2MPG10_BUCK8, + S2MPG10_BUCK9, + S2MPG10_BUCK10, + S2MPG10_REGULATOR_MAX, +}; + +#endif /* __LINUX_MFD_S2MPG10_H */ -- cgit v1.2.3 From adf91d9e1983dec3d5fabc273e8ff89918b852c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Wed, 9 Apr 2025 21:37:39 +0100 Subject: mfd: sec: Change device_type to int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that sec-i2c doesn't match device type by pointer casting anymore, we can switch the device type from unsigned long to int easily. This saves a few bytes in struct sec_pmic_dev due to member alignment. Signed-off-by: André Draszik Link: https://lore.kernel.org/r/20250409-s2mpg10-v4-18-d66d5f39b6bf@linaro.org Signed-off-by: Lee Jones --- include/linux/mfd/samsung/core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/samsung/core.h b/include/linux/mfd/samsung/core.h index c1102324172a..d785e101fe79 100644 --- a/include/linux/mfd/samsung/core.h +++ b/include/linux/mfd/samsung/core.h @@ -67,7 +67,7 @@ struct sec_pmic_dev { struct regmap *regmap_pmic; struct i2c_client *i2c; - unsigned long device_type; + int device_type; int irq; struct regmap_irq_chip_data *irq_data; }; -- cgit v1.2.3 From 34c5f34df95f71fc500ff0e942210ebc97d2ef65 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 9 May 2025 18:35:20 +0100 Subject: mfd: sm501: Remove unused sm501_find_clock sm501_find_clock() was added in 2007 as part of commit b6d6454fdb66 ("[PATCH] mfd: SM501 core driver") but hasn't been used. Remove it. Signed-off-by: "Dr. David Alan Gilbert" Link: https://lore.kernel.org/r/20250509173521.49596-1-linux@treblig.org Signed-off-by: Lee Jones --- include/linux/sm501.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sm501.h b/include/linux/sm501.h index 2f3488b2875d..bcda27a46e7a 100644 --- a/include/linux/sm501.h +++ b/include/linux/sm501.h @@ -12,9 +12,6 @@ extern int sm501_unit_power(struct device *dev, extern unsigned long sm501_set_clock(struct device *dev, int clksrc, unsigned long freq); -extern unsigned long sm501_find_clock(struct device *dev, - int clksrc, unsigned long req_freq); - /* sm501_misc_control * * Modify the SM501's MISC_CONTROL register -- cgit v1.2.3 From db26d62d79e4068934ad0dccdb92715df36352b9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 May 2025 08:57:52 +0100 Subject: netfs: Fix undifferentiation of DIO reads from unbuffered reads On cifs, "DIO reads" (specified by O_DIRECT) need to be differentiated from "unbuffered reads" (specified by cache=none in the mount parameters). The difference is flagged in the protocol and the server may behave differently: Windows Server will, for example, mandate that DIO reads are block aligned. Fix this by adding a NETFS_UNBUFFERED_READ to differentiate this from NETFS_DIO_READ, parallelling the write differentiation that already exists. cifs will then do the right thing. Fixes: 016dc8516aec ("netfs: Implement unbuffered/DIO read support") Signed-off-by: David Howells Link: https://lore.kernel.org/3444961.1747987072@warthog.procyon.org.uk Reviewed-by: "Paulo Alcantara (Red Hat)" Reviewed-by: Viacheslav Dubeyko cc: Steve French cc: netfs@lists.linux.dev cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: ceph-devel@vger.kernel.org cc: linux-nfs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 7a649cfedc09..065c17385e53 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -203,6 +203,7 @@ enum netfs_io_origin { NETFS_READ_GAPS, /* This read is a synchronous read to fill gaps */ NETFS_READ_SINGLE, /* This read should be treated as a single object */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ + NETFS_UNBUFFERED_READ, /* This is an unbuffered read */ NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_WRITEBACK, /* This write was triggered by writepages */ NETFS_WRITEBACK_SINGLE, /* This monolithic write was triggered by writepages */ -- cgit v1.2.3 From 4e83ae6ec87dddac070ba349d3b839589b1bb957 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 May 2025 10:47:06 +0200 Subject: mips, net: ensure that SOCK_COREDUMP is defined For historical reasons mips has to override the socket enum values but the defines are all the same. So simply move the ARCH_HAS_SOCKET_TYPES scope. Fixes: a9194f88782a ("coredump: add coredump socket") Suggested-by: Arnd Bergmann Signed-off-by: Christian Brauner --- include/linux/net.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 139c85d0f2ea..f60fff91e1df 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -70,6 +70,7 @@ enum sock_type { SOCK_DCCP = 6, SOCK_PACKET = 10, }; +#endif /* ARCH_HAS_SOCKET_TYPES */ #define SOCK_MAX (SOCK_PACKET + 1) /* Mask which covers at least up to SOCK_MASK-1. The @@ -83,8 +84,6 @@ enum sock_type { #endif #define SOCK_COREDUMP O_NOCTTY -#endif /* ARCH_HAS_SOCKET_TYPES */ - /** * enum sock_shutdown_cmd - Shutdown types * @SHUT_RD: shutdown receptions -- cgit v1.2.3 From 0e81cfd971dc4833c699dcd8924e54a5021bc4e8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:57 -0700 Subject: af_unix: Move SOCK_PASS{CRED,PIDFD,SEC} to struct sock. As explained in the next patch, SO_PASSRIGHTS would have a problem if we assigned a corresponding bit to socket->flags, so it must be managed in struct sock. Mixing socket->flags and sk->sk_flags for similar options will look confusing, and sk->sk_flags does not have enough space on 32bit system. Also, as mentioned in commit 16e572626961 ("af_unix: dont send SCM_CREDENTIALS by default"), SOCK_PASSCRED and SOCK_PASSPID handling is known to be slow, and managing the flags in struct socket cannot avoid that for embryo sockets. Let's move SOCK_PASS{CRED,PIDFD,SEC} to struct sock. While at it, other SOCK_XXX flags in net.h are grouped as enum. Note that assign_bit() was atomic, so the writer side is moved down after lock_sock() in setsockopt(), but the bit is only read once in sendmsg() and recvmsg(), so lock_sock() is not needed there. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/net.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 0ff950eecc6b..f8418d6e33e0 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -36,14 +36,13 @@ struct net; * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected. * Eventually all flags will be in sk->sk_wq->flags. */ -#define SOCKWQ_ASYNC_NOSPACE 0 -#define SOCKWQ_ASYNC_WAITDATA 1 -#define SOCK_NOSPACE 2 -#define SOCK_PASSCRED 3 -#define SOCK_PASSSEC 4 -#define SOCK_SUPPORT_ZC 5 -#define SOCK_CUSTOM_SOCKOPT 6 -#define SOCK_PASSPIDFD 7 +enum socket_flags { + SOCKWQ_ASYNC_NOSPACE, + SOCKWQ_ASYNC_WAITDATA, + SOCK_NOSPACE, + SOCK_SUPPORT_ZC, + SOCK_CUSTOM_SOCKOPT, +}; #ifndef ARCH_HAS_SOCKET_TYPES /** -- cgit v1.2.3 From d1d89e8eee6f0e91cfad2a0375ad679fd5c1ae83 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 21 May 2025 15:41:46 +0200 Subject: USB: gadget: fix up const issue with struct usb_function_instance In struct usb_function, the struct usb_function_instance pointer variable "fi" is listed as const, but it is written to in numerous places, making the const marking of it a total lie. Fix this up by just removing the const pointer attribute as this is modified in numerous places. Link: https://lore.kernel.org/r/2025052145-undress-puma-f7cf@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/composite.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index 6e38fb9d2117..d8c4e9f73839 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -237,7 +237,7 @@ struct usb_function { /* internals */ struct list_head list; DECLARE_BITMAP(endpoints, 32); - const struct usb_function_instance *fi; + struct usb_function_instance *fi; unsigned int bind_deactivated:1; }; -- cgit v1.2.3 From a1f1acb9c5db9b385c9b3eb1f27f897c06df49ae Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 12:28:44 +0200 Subject: netfilter: nf_dup{4, 6}: Move duplication check to task_struct nf_skb_duplicated is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Due to the recursion involved, the simplest change is to make it a per-task variable. Move the per-CPU variable nf_skb_duplicated to task_struct and name it in_nf_duplicate. Add it to the existing bitfield so it doesn't use additional memory. Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Valentin Schneider Acked-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 11 ----------- include/linux/sched.h | 1 + 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 2b8aac2c70ad..892d12823ed4 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -497,17 +497,6 @@ struct nf_defrag_hook { extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook; extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook; -/* - * nf_skb_duplicated - TEE target has sent a packet - * - * When a xtables target sends a packet, the OUTPUT and POSTROUTING - * hooks are traversed again, i.e. nft and xtables are invoked recursively. - * - * This is used by xtables TEE target to prevent the duplicated skb from - * being duplicated again. - */ -DECLARE_PER_CPU(bool, nf_skb_duplicated); - /* * Contains bitmask of ctnetlink event subscribers, if any. * Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag. diff --git a/include/linux/sched.h b/include/linux/sched.h index f96ac1982893..52d9c52dc8f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1044,6 +1044,7 @@ struct task_struct { /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif + unsigned in_nf_duplicate:1; #ifdef CONFIG_PREEMPT_RT struct netdev_xmit net_xmit; #endif -- cgit v1.2.3 From f37ad91270397a6d053e8623bdb3cf79859691d2 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 12:28:46 +0200 Subject: netfilter: nf_dup_netdev: Move the recursion counter struct netdev_xmit nf_dup_skb_recursion is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Move nf_dup_skb_recursion to struct netdev_xmit, provide wrappers. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Pablo Neira Ayuso --- include/linux/netdevice_xmit.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h index 848735b3a7c0..813a19122ebb 100644 --- a/include/linux/netdevice_xmit.h +++ b/include/linux/netdevice_xmit.h @@ -11,6 +11,9 @@ struct netdev_xmit { #if IS_ENABLED(CONFIG_NET_ACT_MIRRED) u8 sched_mirred_nest; #endif +#if IS_ENABLED(CONFIG_NF_DUP_NETDEV) + u8 nf_dup_skb_recursion; +#endif }; #endif -- cgit v1.2.3 From 90869f43d06dfc836def2f53850a878f829e443e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 22 May 2025 15:49:33 +0200 Subject: netfilter: conntrack: make nf_conntrack_id callable without a module dependency While nf_conntrack_id() doesn't need any functionaliy from conntrack, it does reside in nf_conntrack_core.c -- callers add a module dependency on conntrack. Followup patch will need to compute the conntrack id from nf_tables_trace.c to include it in nf_trace messages emitted to userspace via netlink. I don't want to introduce a module dependency between nf_tables and conntrack for this. Since trace is slowpath, the added indirection is ok. One alternative is to move nf_conntrack_id to the netfilter/core.c, but I don't see a compelling reason so far. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 892d12823ed4..20947f2c685b 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -470,6 +470,7 @@ struct nf_ct_hook { void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); void (*set_closing)(struct nf_conntrack *nfct); int (*confirm)(struct sk_buff *skb); + u32 (*get_id)(const struct nf_conntrack *nfct); }; extern const struct nf_ct_hook __rcu *nf_ct_hook; -- cgit v1.2.3 From 73319a8ee18b9cf0b2dac87f8521595e0381ba0c Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 21 May 2025 22:44:26 +0200 Subject: netfilter: nf_tables: Have a list of nf_hook_ops in nft_hook Supporting a 1:n relationship between nft_hook and nf_hook_ops is convenient since a chain's or flowtable's nft_hooks may remain in place despite matching interfaces disappearing. This stabilizes ruleset dumps in that regard and opens the possibility to claim newly added interfaces which match the spec. Also it prepares for wildcard interface specs since these will potentially match multiple interfaces. All spots dealing with hook registration are updated to handle a list of multiple nf_hook_ops, but nft_netdev_hook_alloc() only adds a single item for now to retain the old behaviour. The only expected functional change here is how vanishing interfaces are handled: Instead of dropping the respective nft_hook, only the matching nf_hook_ops are dropped. To safely remove individual ops from the list in netdev handlers, an rcu_head is added to struct nf_hook_ops so kfree_rcu() may be used. There is at least nft_flowtable_find_dev() which may be iterating through the list at the same time. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 20947f2c685b..5f896fcc074d 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -95,6 +95,9 @@ enum nf_hook_ops_type { }; struct nf_hook_ops { + struct list_head list; + struct rcu_head rcu; + /* User fills in from here down. */ nf_hookfn *hook; struct net_device *dev; -- cgit v1.2.3 From 101f2bbab541116ab861b9c3ac0ece07a7eaa756 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Wed, 7 May 2025 15:34:01 -0700 Subject: fs: convert mount flags to enum In prior kernel versions (5.8-6.8), commit 9f6c61f96f2d9 ("proc/mounts: add cursor") introduced MNT_CURSOR, a flag used by readers from /proc/mounts to keep their place while reading the file. Later, commit 2eea9ce4310d8 ("mounts: keep list of mounts in an rbtree") removed this flag and its value has since been repurposed. For debuggers iterating over the list of mounts, cursors should be skipped as they are irrelevant. Detecting whether an element is a cursor can be difficult. Since the MNT_CURSOR flag is a preprocessor constant, it's not present in debuginfo, and since its value is repurposed, we cannot hard-code it. For this specific issue, cursors are possible to detect in other ways, but ideally, we would be able to read the mount flag definitions out of the debuginfo. For that reason, convert the mount flags to an enum. Link: https://github.com/osandov/drgn/pull/496 Signed-off-by: Stephen Brennan Link: https://lore.kernel.org/20250507223402.2795029-1-stephen.s.brennan@oracle.com Signed-off-by: Christian Brauner --- include/linux/mount.h | 87 ++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mount.h b/include/linux/mount.h index dcc17ce8a959..6904ad33ee7a 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -22,48 +22,51 @@ struct fs_context; struct file; struct path; -#define MNT_NOSUID 0x01 -#define MNT_NODEV 0x02 -#define MNT_NOEXEC 0x04 -#define MNT_NOATIME 0x08 -#define MNT_NODIRATIME 0x10 -#define MNT_RELATIME 0x20 -#define MNT_READONLY 0x40 /* does the user want this to be r/o? */ -#define MNT_NOSYMFOLLOW 0x80 - -#define MNT_SHRINKABLE 0x100 -#define MNT_WRITE_HOLD 0x200 - -#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */ -#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ -/* - * MNT_SHARED_MASK is the set of flags that should be cleared when a - * mount becomes shared. Currently, this is only the flag that says a - * mount cannot be bind mounted, since this is how we create a mount - * that shares events with another mount. If you add a new MNT_* - * flag, consider how it interacts with shared mounts. - */ -#define MNT_SHARED_MASK (MNT_UNBINDABLE) -#define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \ - | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \ - | MNT_READONLY | MNT_NOSYMFOLLOW) -#define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) - -#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) - -#define MNT_INTERNAL 0x4000 - -#define MNT_LOCK_ATIME 0x040000 -#define MNT_LOCK_NOEXEC 0x080000 -#define MNT_LOCK_NOSUID 0x100000 -#define MNT_LOCK_NODEV 0x200000 -#define MNT_LOCK_READONLY 0x400000 -#define MNT_LOCKED 0x800000 -#define MNT_DOOMED 0x1000000 -#define MNT_SYNC_UMOUNT 0x2000000 -#define MNT_MARKED 0x4000000 -#define MNT_UMOUNT 0x8000000 +enum mount_flags { + MNT_NOSUID = 0x01, + MNT_NODEV = 0x02, + MNT_NOEXEC = 0x04, + MNT_NOATIME = 0x08, + MNT_NODIRATIME = 0x10, + MNT_RELATIME = 0x20, + MNT_READONLY = 0x40, /* does the user want this to be r/o? */ + MNT_NOSYMFOLLOW = 0x80, + + MNT_SHRINKABLE = 0x100, + MNT_WRITE_HOLD = 0x200, + + MNT_SHARED = 0x1000, /* if the vfsmount is a shared mount */ + MNT_UNBINDABLE = 0x2000, /* if the vfsmount is a unbindable mount */ + + MNT_INTERNAL = 0x4000, + + MNT_LOCK_ATIME = 0x040000, + MNT_LOCK_NOEXEC = 0x080000, + MNT_LOCK_NOSUID = 0x100000, + MNT_LOCK_NODEV = 0x200000, + MNT_LOCK_READONLY = 0x400000, + MNT_LOCKED = 0x800000, + MNT_DOOMED = 0x1000000, + MNT_SYNC_UMOUNT = 0x2000000, + MNT_MARKED = 0x4000000, + MNT_UMOUNT = 0x8000000, + + /* + * MNT_SHARED_MASK is the set of flags that should be cleared when a + * mount becomes shared. Currently, this is only the flag that says a + * mount cannot be bind mounted, since this is how we create a mount + * that shares events with another mount. If you add a new MNT_* + * flag, consider how it interacts with shared mounts. + */ + MNT_SHARED_MASK = MNT_UNBINDABLE, + MNT_USER_SETTABLE_MASK = MNT_NOSUID | MNT_NODEV | MNT_NOEXEC + | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME + | MNT_READONLY | MNT_NOSYMFOLLOW, + MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME, + + MNT_INTERNAL_FLAGS = MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED, +}; struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ -- cgit v1.2.3 From 09683a6184ad6fd635831402316651392b56cc3a Mon Sep 17 00:00:00 2001 From: Karolina Stolarek Date: Thu, 22 May 2025 18:21:21 -0500 Subject: PCI/AER: Rename struct aer_stats to aer_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update name to reflect the broader definition of structs/variables that are stored (e.g. ratelimits). This is a preparatory patch for adding rate limit support. [bhelgaas: "aer_report" -> "aer_info"] Signed-off-by: Karolina Stolarek Signed-off-by: Bjorn Helgaas Tested-by: Krzysztof Wilczyński Reviewed-by: Ilpo Järvinen Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250522232339.1525671-16-helgaas@kernel.org --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 0e8e3fd77e96..81a81dbfc873 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -346,7 +346,7 @@ struct pci_dev { u8 hdr_type; /* PCI header type (`multi' flag masked out) */ #ifdef CONFIG_PCIEAER u16 aer_cap; /* AER capability offset */ - struct aer_stats *aer_stats; /* AER stats for this device */ + struct aer_info *aer_info; /* AER info for this device */ #endif #ifdef CONFIG_PCIEPORTBUS struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */ -- cgit v1.2.3 From 13423063c7cb7d1c34104a78cae85eb0281bae90 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Mon, 28 Apr 2025 14:07:32 -0700 Subject: arm64: kvm, smccc: Introduce and use API for getting hypervisor UUID The KVM/arm64 uses SMCCC to detect hypervisor presence. That code is private, and it follows the SMCCC specification. Other existing and emerging hypervisor guest implementations can and should use that standard approach as well. Factor out a common infrastructure that the guests can use, update KVM to employ the new API. The central notion of the SMCCC method is the UUID of the hypervisor, and the new API follows that. No functional changes. Validated with a KVM/arm64 guest. Signed-off-by: Roman Kisel Acked-by: Arnd Bergmann Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20250428210742.435282-2-romank@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20250428210742.435282-2-romank@linux.microsoft.com> --- include/linux/arm-smccc.h | 64 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index a3863da1510e..784ebe4607a4 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -7,6 +7,11 @@ #include #include + +#ifndef __ASSEMBLY__ +#include +#endif + #include /* @@ -107,10 +112,10 @@ ARM_SMCCC_FUNC_QUERY_CALL_UID) /* KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74 */ -#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 0xb66fb428U -#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 0xe911c52eU -#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 0x564bcaa9U -#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3 0x743a004dU +#define ARM_SMCCC_VENDOR_HYP_UID_KVM UUID_INIT(\ + 0xb66fb428, 0xc52e, 0xe911, \ + 0xa9, 0xca, 0x4b, 0x56, \ + 0x4d, 0x00, 0x3a, 0x74) /* KVM "vendor specific" services */ #define ARM_SMCCC_KVM_FUNC_FEATURES 0 @@ -348,6 +353,57 @@ s32 arm_smccc_get_soc_id_version(void); */ s32 arm_smccc_get_soc_id_revision(void); +#ifndef __ASSEMBLY__ + +/* + * Returns whether a specific hypervisor UUID is advertised for the + * Vendor Specific Hypervisor Service range. + */ +bool arm_smccc_hypervisor_has_uuid(const uuid_t *uuid); + +static inline uuid_t smccc_res_to_uuid(u32 r0, u32 r1, u32 r2, u32 r3) +{ + uuid_t uuid = { + .b = { + [0] = (r0 >> 0) & 0xff, + [1] = (r0 >> 8) & 0xff, + [2] = (r0 >> 16) & 0xff, + [3] = (r0 >> 24) & 0xff, + + [4] = (r1 >> 0) & 0xff, + [5] = (r1 >> 8) & 0xff, + [6] = (r1 >> 16) & 0xff, + [7] = (r1 >> 24) & 0xff, + + [8] = (r2 >> 0) & 0xff, + [9] = (r2 >> 8) & 0xff, + [10] = (r2 >> 16) & 0xff, + [11] = (r2 >> 24) & 0xff, + + [12] = (r3 >> 0) & 0xff, + [13] = (r3 >> 8) & 0xff, + [14] = (r3 >> 16) & 0xff, + [15] = (r3 >> 24) & 0xff, + }, + }; + + return uuid; +} + +static inline u32 smccc_uuid_to_reg(const uuid_t *uuid, int reg) +{ + u32 val = 0; + + val |= (u32)(uuid->b[4 * reg + 0] << 0); + val |= (u32)(uuid->b[4 * reg + 1] << 8); + val |= (u32)(uuid->b[4 * reg + 2] << 16); + val |= (u32)(uuid->b[4 * reg + 3] << 24); + + return val; +} + +#endif /* !__ASSEMBLY__ */ + /** * struct arm_smccc_res - Result from SMC/HVC call * @a0-a3 result values from registers 0 to 3 -- cgit v1.2.3 From 18a34bb5221e2b79dbcba5bb50d92beb45b68e15 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Mon, 28 Apr 2025 14:07:40 -0700 Subject: Drivers: hv: vmbus: Introduce hv_get_vmbus_root_device() The ARM64 PCI code for hyperv needs to know the VMBus root device, and it is private. Provide a function that returns it. Rename it from "hv_dev" as "hv_dev" as a symbol is very overloaded. No functional changes. Signed-off-by: Roman Kisel Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20250428210742.435282-10-romank@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20250428210742.435282-10-romank@linux.microsoft.com> --- include/linux/hyperv.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index d6ffe01962c2..23b3c3a2ed8c 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1283,6 +1283,8 @@ static inline void *hv_get_drvdata(struct hv_device *dev) return dev_get_drvdata(&dev->device); } +struct device *hv_get_vmbus_root_device(void); + struct hv_ring_buffer_debug_info { u32 current_interrupt_mask; u32 current_read_index; -- cgit v1.2.3 From ab7e531a8212394608838c31e9dfac69fafa6c8a Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Mon, 28 Apr 2025 14:07:41 -0700 Subject: ACPI: irq: Introduce acpi_get_gsi_dispatcher() Using acpi_irq_create_hierarchy() in the cases where the code also handles OF leads to code duplication as the ACPI subsystem doesn't provide means to compute the IRQ domain parent whereas the OF does. Introduce acpi_get_gsi_dispatcher() so that the drivers relying on both ACPI and OF may use irq_domain_create_hierarchy() in the common code paths. No functional changes. Signed-off-by: Roman Kisel Reviewed-by: Michael Kelley Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20250428210742.435282-11-romank@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20250428210742.435282-11-romank@linux.microsoft.com> --- include/linux/acpi.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 3f2e93ed9730..14e8871b5787 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -335,8 +335,11 @@ int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi); +typedef struct fwnode_handle *(*acpi_gsi_domain_disp_fn)(u32); + void acpi_set_irq_model(enum acpi_irq_model_id model, - struct fwnode_handle *(*)(u32)); + acpi_gsi_domain_disp_fn fn); +acpi_gsi_domain_disp_fn acpi_get_gsi_dispatcher(void); void acpi_set_gsi_to_irq_fallback(u32 (*)(u32)); struct irq_domain *acpi_irq_create_hierarchy(unsigned int flags, -- cgit v1.2.3 From 059717c2ba1f745f35f4bbb8991e408c12031f1e Mon Sep 17 00:00:00 2001 From: Anil S Keshavamurthy Date: Fri, 23 May 2025 13:20:01 -0400 Subject: ACPI: MRRM: Fix default max memory region Per the spec, the default max memory region must be 1 covering all system memory. When platform does not provide ACPI MRRM table or when CONFIG_ACPI is opted out, the acpi_mrrm_max_mem_region() function defaults to returning 1 region complying to RDT spec. Signed-off-by: Anil S Keshavamurthy Reviewed-by: Tony Luck Link: https://patch.msgid.link/20250523172001.1761634-1-anil.s.keshavamurthy@intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index c409f4cecb09..d9ba7b4fa2cd 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1098,7 +1098,7 @@ static inline acpi_handle acpi_get_processor_handle(int cpu) static inline int acpi_mrrm_max_mem_region(void) { - return -ENOENT; + return 1; } #endif /* !CONFIG_ACPI */ -- cgit v1.2.3 From 588ca944c27729c7f950d1f44c6d6700a919969a Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Wed, 21 May 2025 13:47:45 +0100 Subject: cxl/edac: Add CXL memory device memory sparing control feature Memory sparing is defined as a repair function that replaces a portion of memory with a portion of functional memory at that same DPA. The subclasses for this operation vary in terms of the scope of the sparing being performed. The cacheline sparing subclass refers to a sparing action that can replace a full cacheline. Row sparing is provided as an alternative to PPR sparing functions and its scope is that of a single DDR row. As per CXL r3.2 Table 8-125 foot note 1. Memory sparing is preferred over PPR when possible. Bank sparing allows an entire bank to be replaced. Rank sparing is defined as an operation in which an entire DDR rank is replaced. Memory sparing maintenance operations may be supported by CXL devices that implement CXL.mem protocol. A sparing maintenance operation requests the CXL device to perform a repair operation on its media. For example, a CXL device with DRAM components that support memory sparing features may implement sparing maintenance operations. The host may issue a query command by setting query resources flag in the input payload (CXL spec 3.2 Table 8-120) to determine availability of sparing resources for a given address. In response to a query request, the device shall report the resource availability by producing the memory sparing event record (CXL spec 3.2 Table 8-60) in which the Channel, Rank, Nibble Mask, Bank Group, Bank, Row, Column, Sub-Channel fields are a copy of the values specified in the request. During the execution of a sparing maintenance operation, a CXL memory device: - may not retain data - may not be able to process CXL.mem requests correctly. These CXL memory device capabilities are specified by restriction flags in the memory sparing feature readable attributes. When a CXL device identifies error on a memory component, the device may inform the host about the need for a memory sparing maintenance operation by using DRAM event record, where the 'maintenance needed' flag may set. The event record contains some of the DPA, Channel, Rank, Nibble Mask, Bank Group, Bank, Row, Column, Sub-Channel fields that should be repaired. The userspace tool requests for maintenance operation if the 'maintenance needed' flag set in the CXL DRAM error record. CXL spec 3.2 section 8.2.10.7.1.4 describes the device's memory sparing maintenance operation feature. CXL spec 3.2 section 8.2.10.7.2.3 describes the memory sparing feature discovery and configuration. Add support for controlling CXL memory device memory sparing feature. Register with EDAC driver, which gets the memory repair attr descriptors from the EDAC memory repair driver and exposes sysfs repair control attributes for memory sparing to the userspace. For example CXL memory sparing control for the CXL mem0 device is exposed in /sys/bus/edac/devices/cxl_mem0/mem_repairX/ Use case ======== 1. CXL device identifies a failure in a memory component, report to userspace in a CXL DRAM trace event with DPA and other attributes of memory to repair such as channel, rank, nibble mask, bank Group, bank, row, column, sub-channel. 2. Rasdaemon process the trace event and may issue query request in sysfs check resources available for memory sparing if either of the following conditions met. - 'maintenance needed' flag set in the event record. - 'threshold event' flag set for CVME threshold feature. - When the number of corrected error reported on a CXL.mem media to the userspace exceeds the threshold value for corrected error count defined by the userspace policy. 3. Rasdaemon process the memory sparing trace event and issue repair request for memory sparing. Kernel CXL driver shall report memory sparing event record to the userspace with the resource availability in order rasdaemon to process the event record and issue a repair request in sysfs for the memory sparing operation in the CXL device. Note: Based on the feedbacks from the community 'query' sysfs attribute is removed and reporting memory sparing error record to the userspace are not supported. Instead userspace issues sparing operation and kernel does the same to the CXL memory device, when 'maintenance needed' flag set in the DRAM event record. Add checks to ensure the memory to be repaired is offline and if online, then originates from a CXL DRAM error record reported in the current boot before requesting a memory sparing operation on the device. Note: Tested memory sparing feature control with QEMU patch "hw/cxl: Add emulation for memory sparing control feature" https://lore.kernel.org/linux-cxl/20250509172229.726-1-shiju.jose@huawei.com/T/#m5f38512a95670d75739f9dad3ee91b95c7f5c8d6 Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Signed-off-by: Shiju Jose Reviewed-by: Alison Schofield Acked-by: Dan Williams Link: https://patch.msgid.link/20250521124749.817-8-shiju.jose@huawei.com Signed-off-by: Dave Jiang --- include/linux/edac.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/edac.h b/include/linux/edac.h index 451f9c152c99..fa32f2aca22f 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -745,9 +745,16 @@ static inline int edac_ecs_get_desc(struct device *ecs_dev, #endif /* CONFIG_EDAC_ECS */ enum edac_mem_repair_type { + EDAC_REPAIR_PPR, + EDAC_REPAIR_CACHELINE_SPARING, + EDAC_REPAIR_ROW_SPARING, + EDAC_REPAIR_BANK_SPARING, + EDAC_REPAIR_RANK_SPARING, EDAC_REPAIR_MAX }; +extern const char * const edac_repair_type[]; + enum edac_mem_repair_cmd { EDAC_DO_MEM_REPAIR = 1, }; -- cgit v1.2.3 From e7d952cc39fca34386ec9f15f68cb2eaac01b5ae Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 24 May 2025 11:23:25 +0200 Subject: perf/headers: Clean up a bit Do a bit of readability spring cleaning: - Fix misaligned structure member in perf_addr_filter: the new struct perf_addr_filter::action member was too long, but when it was added it was not aligned properly. Align all fields to the customary column 41 alignment of most of the rest of the header. - Adjust the vertical alignment of the definition of other structures and definitions as well, so that the 'most of' in the previous paragraph changes to 'all of'. ;-) - Prettify the assignments in perf_clear_branch_entry_bitfields() - Move comments from CPP definitions to outside the macro - Move perf_guest_info_callbacks and related defines from the front of the header closer to where it's used within the header. - And more #endif markers for larger CPP blocks and standardize #if/#else/#endif blocks to the following nomenclature: #ifdef CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Standardize on consistently using the 'extern' storage class where appropriate, we had cases where method prototypes sometimes omitted the storage class: extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); Which is obviously a bit confusing and adds unnecessary noise. - s/__u64/u64 and similar cleanups: there's no point in using __u64 in non-UAPI headers, and doing so only adds unnecessary visual noise. - Harmonize all multi-parameter function prototypes along the following style: extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t callback, void *context); - etc. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Alexander Shishkin Cc: Mark Rutland Cc: Namhyung Kim Cc: Ian Rogers Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 282 +++++++++++++++++++++++++-------------------- 1 file changed, 155 insertions(+), 127 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a96c00e2ceca..52dc7cfab0e0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -26,18 +26,9 @@ # include #endif -#define PERF_GUEST_ACTIVE 0x01 -#define PERF_GUEST_USER 0x02 - -struct perf_guest_info_callbacks { - unsigned int (*state)(void); - unsigned long (*get_ip)(void); - unsigned int (*handle_intel_pt_intr)(void); -}; - #ifdef CONFIG_HAVE_HW_BREAKPOINT -#include -#include +# include +# include #endif #include @@ -62,19 +53,20 @@ struct perf_guest_info_callbacks { #include #include #include + #include struct perf_callchain_entry { - __u64 nr; - __u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ + u64 nr; + u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_callchain_entry_ctx { - struct perf_callchain_entry *entry; - u32 max_stack; - u32 nr; - short contexts; - bool contexts_maxed; + struct perf_callchain_entry *entry; + u32 max_stack; + u32 nr; + short contexts; + bool contexts_maxed; }; typedef unsigned long (*perf_copy_f)(void *dst, const void *src, @@ -121,8 +113,8 @@ static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag) * already stored in age order, the hw_idx should be 0. */ struct perf_branch_stack { - __u64 nr; - __u64 hw_idx; + u64 nr; + u64 hw_idx; struct perf_branch_entry entries[]; }; @@ -132,10 +124,10 @@ struct task_struct; * extra PMU register associated with an event */ struct hw_perf_event_extra { - u64 config; /* register value */ - unsigned int reg; /* register address or index */ - int alloc; /* extra register already allocated */ - int idx; /* index in shared_regs->regs[] */ + u64 config; /* register value */ + unsigned int reg; /* register address or index */ + int alloc; /* extra register already allocated */ + int idx; /* index in shared_regs->regs[] */ }; /** @@ -144,8 +136,8 @@ struct hw_perf_event_extra { * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ -#define PERF_EVENT_FLAG_ARCH 0x0fffffff -#define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 +#define PERF_EVENT_FLAG_ARCH 0x0fffffff +#define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); @@ -227,9 +219,14 @@ struct hw_perf_event { /* * hw_perf_event::state flags; used to track the PERF_EF_* state. */ -#define PERF_HES_STOPPED 0x01 /* the counter is stopped */ -#define PERF_HES_UPTODATE 0x02 /* event->count up-to-date */ -#define PERF_HES_ARCH 0x04 + +/* the counter is stopped */ +#define PERF_HES_STOPPED 0x01 + +/* event->count up-to-date */ +#define PERF_HES_UPTODATE 0x02 + +#define PERF_HES_ARCH 0x04 int state; @@ -278,7 +275,7 @@ struct hw_perf_event { */ u64 freq_time_stamp; u64 freq_count_stamp; -#endif +#endif /* CONFIG_PERF_EVENTS */ }; struct perf_event; @@ -287,29 +284,33 @@ struct perf_event_pmu_context; /* * Common implementation detail of pmu::{start,commit,cancel}_txn */ -#define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ -#define PERF_PMU_TXN_READ 0x2 /* txn to read event group from PMU */ + +/* txn to add/schedule event on PMU */ +#define PERF_PMU_TXN_ADD 0x1 + +/* txn to read event group from PMU */ +#define PERF_PMU_TXN_READ 0x2 /** * pmu::capabilities flags */ -#define PERF_PMU_CAP_NO_INTERRUPT 0x0001 -#define PERF_PMU_CAP_NO_NMI 0x0002 -#define PERF_PMU_CAP_AUX_NO_SG 0x0004 -#define PERF_PMU_CAP_EXTENDED_REGS 0x0008 -#define PERF_PMU_CAP_EXCLUSIVE 0x0010 -#define PERF_PMU_CAP_ITRACE 0x0020 -#define PERF_PMU_CAP_NO_EXCLUDE 0x0040 -#define PERF_PMU_CAP_AUX_OUTPUT 0x0080 -#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 -#define PERF_PMU_CAP_AUX_PAUSE 0x0200 -#define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 +#define PERF_PMU_CAP_NO_INTERRUPT 0x0001 +#define PERF_PMU_CAP_NO_NMI 0x0002 +#define PERF_PMU_CAP_AUX_NO_SG 0x0004 +#define PERF_PMU_CAP_EXTENDED_REGS 0x0008 +#define PERF_PMU_CAP_EXCLUSIVE 0x0010 +#define PERF_PMU_CAP_ITRACE 0x0020 +#define PERF_PMU_CAP_NO_EXCLUDE 0x0040 +#define PERF_PMU_CAP_AUX_OUTPUT 0x0080 +#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 +#define PERF_PMU_CAP_AUX_PAUSE 0x0200 +#define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 /** * pmu::scope */ enum perf_pmu_scope { - PERF_PMU_SCOPE_NONE = 0, + PERF_PMU_SCOPE_NONE = 0, PERF_PMU_SCOPE_CORE, PERF_PMU_SCOPE_DIE, PERF_PMU_SCOPE_CLUSTER, @@ -393,11 +394,21 @@ struct pmu { * Flags for ->add()/->del()/ ->start()/->stop(). There are * matching hw_perf_event::state flags. */ -#define PERF_EF_START 0x01 /* start the counter when adding */ -#define PERF_EF_RELOAD 0x02 /* reload the counter when starting */ -#define PERF_EF_UPDATE 0x04 /* update the counter when stopping */ -#define PERF_EF_PAUSE 0x08 /* AUX area event, pause tracing */ -#define PERF_EF_RESUME 0x10 /* AUX area event, resume tracing */ + +/* start the counter when adding */ +#define PERF_EF_START 0x01 + +/* reload the counter when starting */ +#define PERF_EF_RELOAD 0x02 + +/* update the counter when stopping */ +#define PERF_EF_UPDATE 0x04 + +/* AUX area event, pause tracing */ +#define PERF_EF_PAUSE 0x08 + +/* AUX area event, resume tracing */ +#define PERF_EF_RESUME 0x10 /* * Adds/Removes a counter to/from the PMU, can be done inside a @@ -596,10 +607,10 @@ enum perf_addr_filter_action_t { * This is a hardware-agnostic filter configuration as specified by the user. */ struct perf_addr_filter { - struct list_head entry; - struct path path; - unsigned long offset; - unsigned long size; + struct list_head entry; + struct path path; + unsigned long offset; + unsigned long size; enum perf_addr_filter_action_t action; }; @@ -614,14 +625,14 @@ struct perf_addr_filter { * bundled together; see perf_event_addr_filters(). */ struct perf_addr_filters_head { - struct list_head list; - raw_spinlock_t lock; - unsigned int nr_file_filters; + struct list_head list; + raw_spinlock_t lock; + unsigned int nr_file_filters; }; struct perf_addr_filter_range { - unsigned long start; - unsigned long size; + unsigned long start; + unsigned long size; }; /** @@ -669,24 +680,24 @@ struct swevent_hlist { struct rcu_head rcu_head; }; -#define PERF_ATTACH_CONTEXT 0x0001 -#define PERF_ATTACH_GROUP 0x0002 -#define PERF_ATTACH_TASK 0x0004 -#define PERF_ATTACH_TASK_DATA 0x0008 -#define PERF_ATTACH_GLOBAL_DATA 0x0010 -#define PERF_ATTACH_SCHED_CB 0x0020 -#define PERF_ATTACH_CHILD 0x0040 -#define PERF_ATTACH_EXCLUSIVE 0x0080 -#define PERF_ATTACH_CALLCHAIN 0x0100 -#define PERF_ATTACH_ITRACE 0x0200 +#define PERF_ATTACH_CONTEXT 0x0001 +#define PERF_ATTACH_GROUP 0x0002 +#define PERF_ATTACH_TASK 0x0004 +#define PERF_ATTACH_TASK_DATA 0x0008 +#define PERF_ATTACH_GLOBAL_DATA 0x0010 +#define PERF_ATTACH_SCHED_CB 0x0020 +#define PERF_ATTACH_CHILD 0x0040 +#define PERF_ATTACH_EXCLUSIVE 0x0080 +#define PERF_ATTACH_CALLCHAIN 0x0100 +#define PERF_ATTACH_ITRACE 0x0200 struct bpf_prog; struct perf_cgroup; struct perf_buffer; struct pmu_event_list { - raw_spinlock_t lock; - struct list_head list; + raw_spinlock_t lock; + struct list_head list; }; /* @@ -696,12 +707,12 @@ struct pmu_event_list { * disabled is sufficient since it will hold-off the IPIs. */ #ifdef CONFIG_PROVE_LOCKING -#define lockdep_assert_event_ctx(event) \ +# define lockdep_assert_event_ctx(event) \ WARN_ON_ONCE(__lockdep_enabled && \ (this_cpu_read(hardirqs_enabled) && \ lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) #else -#define lockdep_assert_event_ctx(event) +# define lockdep_assert_event_ctx(event) #endif #define for_each_sibling_event(sibling, event) \ @@ -859,9 +870,9 @@ struct perf_event { #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; struct event_filter *filter; -#ifdef CONFIG_FUNCTION_TRACER +# ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops ftrace_ops; -#endif +# endif #endif #ifdef CONFIG_CGROUP_PERF @@ -880,7 +891,7 @@ struct perf_event { * of it. event->orig_type contains original 'type' requested by * user. */ - __u32 orig_type; + u32 orig_type; #endif /* CONFIG_PERF_EVENTS */ }; @@ -945,8 +956,8 @@ static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc) } struct perf_event_groups { - struct rb_root tree; - u64 index; + struct rb_root tree; + u64 index; }; @@ -1189,16 +1200,18 @@ extern void perf_pmu_resched(struct pmu *pmu); extern int perf_event_refresh(struct perf_event *event, int refresh); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); + extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, - int cpu, - struct task_struct *task, - perf_overflow_handler_t callback, - void *context); + int cpu, + struct task_struct *task, + perf_overflow_handler_t callback, + void *context); + extern void perf_pmu_migrate_context(struct pmu *pmu, - int src_cpu, int dst_cpu); -int perf_event_read_local(struct perf_event *event, u64 *value, - u64 *enabled, u64 *running); + int src_cpu, int dst_cpu); +extern int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); @@ -1415,14 +1428,14 @@ static inline u32 perf_sample_data_size(struct perf_sample_data *data, */ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) { - br->mispred = 0; - br->predicted = 0; - br->in_tx = 0; - br->abort = 0; - br->cycles = 0; - br->type = 0; - br->spec = PERF_BR_SPEC_NA; - br->reserved = 0; + br->mispred = 0; + br->predicted = 0; + br->in_tx = 0; + br->abort = 0; + br->cycles = 0; + br->type = 0; + br->spec = PERF_BR_SPEC_NA; + br->reserved = 0; } extern void perf_output_sample(struct perf_output_handle *handle, @@ -1611,7 +1624,17 @@ extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); +#define PERF_GUEST_ACTIVE 0x01 +#define PERF_GUEST_USER 0x02 + +struct perf_guest_info_callbacks { + unsigned int (*state)(void); + unsigned long (*get_ip)(void); + unsigned int (*handle_intel_pt_intr)(void); +}; + #ifdef CONFIG_GUEST_PERF_EVENTS + extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); @@ -1622,21 +1645,27 @@ static inline unsigned int perf_guest_state(void) { return static_call(__perf_guest_state)(); } + static inline unsigned long perf_guest_get_ip(void) { return static_call(__perf_guest_get_ip)(); } + static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return static_call(__perf_guest_handle_intel_pt_intr)(); } + extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); -#else + +#else /* !CONFIG_GUEST_PERF_EVENTS: */ + static inline unsigned int perf_guest_state(void) { return 0; } static inline unsigned long perf_guest_get_ip(void) { return 0; } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; } -#endif /* CONFIG_GUEST_PERF_EVENTS */ + +#endif /* !CONFIG_GUEST_PERF_EVENTS */ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); @@ -1666,6 +1695,7 @@ static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx * { if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) { struct perf_callchain_entry *entry = ctx->entry; + entry->ip[entry->nr++] = ip; ++ctx->contexts; return 0; @@ -1679,6 +1709,7 @@ static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 { if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) { struct perf_callchain_entry *entry = ctx->entry; + entry->ip[entry->nr++] = ip; ++ctx->nr; return 0; @@ -1705,7 +1736,7 @@ static inline int perf_is_paranoid(void) return sysctl_perf_event_paranoid > -1; } -int perf_allow_kernel(void); +extern int perf_allow_kernel(void); static inline int perf_allow_cpu(void) { @@ -1827,7 +1858,7 @@ extern int perf_output_begin_backward(struct perf_output_handle *handle, extern void perf_output_end(struct perf_output_handle *handle); extern unsigned int perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len); + const void *buf, unsigned int len); extern unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len); extern long perf_output_copy_aux(struct perf_output_handle *aux_handle, @@ -1844,7 +1875,9 @@ extern void perf_event_task_tick(void); extern int perf_event_account_interrupt(struct perf_event *event); extern int perf_event_period(struct perf_event *event, u64 value); extern u64 perf_event_pause(struct perf_event *event, bool reset); + #else /* !CONFIG_PERF_EVENTS: */ + static inline void * perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) { return NULL; } @@ -1922,19 +1955,14 @@ static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } -static inline int perf_event_period(struct perf_event *event, u64 value) -{ - return -EINVAL; -} -static inline u64 perf_event_pause(struct perf_event *event, bool reset) -{ - return 0; -} -static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) -{ - return 0; -} -#endif +static inline int +perf_event_period(struct perf_event *event, u64 value) { return -EINVAL; } +static inline u64 +perf_event_pause(struct perf_event *event, bool reset) { return 0; } +static inline int +perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { return 0; } + +#endif /* !CONFIG_PERF_EVENTS */ #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern void perf_restore_debug_store(void); @@ -1942,31 +1970,31 @@ extern void perf_restore_debug_store(void); static inline void perf_restore_debug_store(void) { } #endif -#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) +#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) struct perf_pmu_events_attr { - struct device_attribute attr; - u64 id; - const char *event_str; + struct device_attribute attr; + u64 id; + const char *event_str; }; struct perf_pmu_events_ht_attr { - struct device_attribute attr; - u64 id; - const char *event_str_ht; - const char *event_str_noht; + struct device_attribute attr; + u64 id; + const char *event_str_ht; + const char *event_str_noht; }; struct perf_pmu_events_hybrid_attr { - struct device_attribute attr; - u64 id; - const char *event_str; - u64 pmu_type; + struct device_attribute attr; + u64 id; + const char *event_str; + u64 pmu_type; }; struct perf_pmu_format_hybrid_attr { - struct device_attribute attr; - u64 pmu_type; + struct device_attribute attr; + u64 pmu_type; }; ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, @@ -2008,11 +2036,11 @@ static struct device_attribute format_attr_##_name = __ATTR_RO(_name) /* Performance counter hotplug functions */ #ifdef CONFIG_PERF_EVENTS -int perf_event_init_cpu(unsigned int cpu); -int perf_event_exit_cpu(unsigned int cpu); +extern int perf_event_init_cpu(unsigned int cpu); +extern int perf_event_exit_cpu(unsigned int cpu); #else -#define perf_event_init_cpu NULL -#define perf_event_exit_cpu NULL +# define perf_event_init_cpu NULL +# define perf_event_exit_cpu NULL #endif extern void arch_perf_update_userpage(struct perf_event *event, -- cgit v1.2.3 From 12ca42c237756182aad8ab04654c952765cb9061 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 16 May 2025 17:07:39 -0700 Subject: alloc_tag: allocate percpu counters for module tags dynamically When a module gets unloaded it checks whether any of its tags are still in use and if so, we keep the memory containing module's allocation tags alive until all tags are unused. However percpu counters referenced by the tags are freed by free_module(). This will lead to UAF if the memory allocated by a module is accessed after module was unloaded. To fix this we allocate percpu counters for module allocation tags dynamically and we keep it alive for tags which are still in use after module unloading. This also removes the requirement of a larger PERCPU_MODULE_RESERVE when memory allocation profiling is enabled because percpu memory for counters does not need to be reserved anymore. Link: https://lkml.kernel.org/r/20250517000739.5930-1-surenb@google.com Fixes: 0db6f8d7820a ("alloc_tag: load module tags into separate contiguous memory") Signed-off-by: Suren Baghdasaryan Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/all/20250516131246.6244-1-00107082@163.com/ Tested-by: David Wang <00107082@163.com> Cc: Christoph Lameter (Ampere) Cc: Dennis Zhou Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 12 ++++++++++++ include/linux/codetag.h | 8 ++++---- include/linux/percpu.h | 4 ---- 3 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index a946e0203e6d..8f7931eb7d16 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -104,6 +104,16 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); #else /* ARCH_NEEDS_WEAK_PER_CPU */ +#ifdef MODULE + +#define DEFINE_ALLOC_TAG(_alloc_tag) \ + static struct alloc_tag _alloc_tag __used __aligned(8) \ + __section(ALLOC_TAG_SECTION_NAME) = { \ + .ct = CODE_TAG_INIT, \ + .counters = NULL }; + +#else /* MODULE */ + #define DEFINE_ALLOC_TAG(_alloc_tag) \ static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \ static struct alloc_tag _alloc_tag __used __aligned(8) \ @@ -111,6 +121,8 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); .ct = CODE_TAG_INIT, \ .counters = &_alloc_tag_cntr }; +#endif /* MODULE */ + #endif /* ARCH_NEEDS_WEAK_PER_CPU */ DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, diff --git a/include/linux/codetag.h b/include/linux/codetag.h index d14dbd26b370..0ee4c21c6dbc 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -36,10 +36,10 @@ union codetag_ref { struct codetag_type_desc { const char *section; size_t tag_size; - void (*module_load)(struct codetag_type *cttype, - struct codetag_module *cmod); - void (*module_unload)(struct codetag_type *cttype, - struct codetag_module *cmod); + void (*module_load)(struct module *mod, + struct codetag *start, struct codetag *end); + void (*module_unload)(struct module *mod, + struct codetag *start, struct codetag *end); #ifdef CONFIG_MODULES void (*module_replaced)(struct module *mod, struct module *new_mod); bool (*needs_section_mem)(struct module *mod, unsigned long size); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 52b5ea663b9f..85bf8dd9f087 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -15,11 +15,7 @@ /* enough to cover all DEFINE_PER_CPUs in modules */ #ifdef CONFIG_MODULES -#ifdef CONFIG_MEM_ALLOC_PROFILING -#define PERCPU_MODULE_RESERVE (8 << 13) -#else #define PERCPU_MODULE_RESERVE (8 << 10) -#endif #else #define PERCPU_MODULE_RESERVE 0 #endif -- cgit v1.2.3 From ee40c9920ac286c5bfe7c811e66ff899266d2582 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Ca=C3=B1uelo=20Navarro?= Date: Fri, 23 May 2025 14:19:10 +0200 Subject: mm: fix copy_vma() error handling for hugetlb mappings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If, during a mremap() operation for a hugetlb-backed memory mapping, copy_vma() fails after the source vma has been duplicated and opened (ie. vma_link() fails), the error is handled by closing the new vma. This updates the hugetlbfs reservation counter of the reservation map which at this point is referenced by both the source vma and the new copy. As a result, once the new vma has been freed and copy_vma() returns, the reservation counter for the source vma will be incorrect. This patch addresses this corner case by clearing the hugetlb private page reservation reference for the new vma and decrementing the reference before closing the vma, so that vma_close() won't update the reservation counter. This is also what copy_vma_and_data() does with the source vma if copy_vma() succeeds, so a helper function has been added to do the fixup in both functions. The issue was reported by a private syzbot instance and can be reproduced using the C reproducer in [1]. It's also a possible duplicate of public syzbot report [2]. The WARNING report is: ============================================================ page_counter underflow: -1024 nr_pages=1024 WARNING: CPU: 0 PID: 3287 at mm/page_counter.c:61 page_counter_cancel+0xf6/0x120 Modules linked in: CPU: 0 UID: 0 PID: 3287 Comm: repro__WARNING_ Not tainted 6.15.0-rc7+ #54 NONE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-2-gc13ff2cd-prebuilt.qemu.org 04/01/2014 RIP: 0010:page_counter_cancel+0xf6/0x120 Code: ff 5b 41 5e 41 5f 5d c3 cc cc cc cc e8 f3 4f 8f ff c6 05 64 01 27 06 01 48 c7 c7 60 15 f8 85 48 89 de 4c 89 fa e8 2a a7 51 ff <0f> 0b e9 66 ff ff ff 44 89 f9 80 e1 07 38 c1 7c 9d 4c 81 RSP: 0018:ffffc900025df6a0 EFLAGS: 00010246 RAX: 2edfc409ebb44e00 RBX: fffffffffffffc00 RCX: ffff8880155f0000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000 RBP: dffffc0000000000 R08: ffffffff81c4a23c R09: 1ffff1100330482a R10: dffffc0000000000 R11: ffffed100330482b R12: 0000000000000000 R13: ffff888058a882c0 R14: ffff888058a882c0 R15: 0000000000000400 FS: 0000000000000000(0000) GS:ffff88808fc53000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004b33e0 CR3: 00000000076d6000 CR4: 00000000000006f0 Call Trace: page_counter_uncharge+0x33/0x80 hugetlb_cgroup_uncharge_counter+0xcb/0x120 hugetlb_vm_op_close+0x579/0x960 ? __pfx_hugetlb_vm_op_close+0x10/0x10 remove_vma+0x88/0x130 exit_mmap+0x71e/0xe00 ? __pfx_exit_mmap+0x10/0x10 ? __mutex_unlock_slowpath+0x22e/0x7f0 ? __pfx_exit_aio+0x10/0x10 ? __up_read+0x256/0x690 ? uprobe_clear_state+0x274/0x290 ? mm_update_next_owner+0xa9/0x810 __mmput+0xc9/0x370 exit_mm+0x203/0x2f0 ? __pfx_exit_mm+0x10/0x10 ? taskstats_exit+0x32b/0xa60 do_exit+0x921/0x2740 ? do_raw_spin_lock+0x155/0x3b0 ? __pfx_do_exit+0x10/0x10 ? __pfx_do_raw_spin_lock+0x10/0x10 ? _raw_spin_lock_irq+0xc5/0x100 do_group_exit+0x20c/0x2c0 get_signal+0x168c/0x1720 ? __pfx_get_signal+0x10/0x10 ? schedule+0x165/0x360 arch_do_signal_or_restart+0x8e/0x7d0 ? __pfx_arch_do_signal_or_restart+0x10/0x10 ? __pfx___se_sys_futex+0x10/0x10 syscall_exit_to_user_mode+0xb8/0x2c0 do_syscall_64+0x75/0x120 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x422dcd Code: Unable to access opcode bytes at 0x422da3. RSP: 002b:00007ff266cdb208 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: 0000000000000001 RBX: 00007ff266cdbcdc RCX: 0000000000422dcd RDX: 00000000000f4240 RSI: 0000000000000081 RDI: 00000000004c7bec RBP: 00007ff266cdb220 R08: 203a6362696c6720 R09: 203a6362696c6720 R10: 0000200000c00000 R11: 0000000000000246 R12: ffffffffffffffd0 R13: 0000000000000002 R14: 00007ffe1cb5f520 R15: 00007ff266cbb000 ============================================================ Link: https://lkml.kernel.org/r/20250523-warning_in_page_counter_cancel-v2-1-b6df1a8cfefd@igalia.com Link: https://people.igalia.com/rcn/kernel_logs/20250422__WARNING_in_page_counter_cancel__repro.c [1] Link: https://lore.kernel.org/all/67000a50.050a0220.49194.048d.GAE@google.com/ [2] Signed-off-by: Ricardo Cañuelo Navarro Suggested-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Florent Revest Cc: Jann Horn Cc: Oscar Salvador Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8f3ac832ee7f..4861a7e304bb 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -275,6 +275,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, bool is_hugetlb_entry_migration(pte_t pte); bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); +void fixup_hugetlb_reservations(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ @@ -468,6 +469,10 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { } +static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) +{ +} + #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write -- cgit v1.2.3 From 707f853d7fa3ce323a6875487890c213e34d81a0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 May 2025 16:12:09 +0200 Subject: module: Provide EXPORT_SYMBOL_GPL_FOR_MODULES() helper Helper macro to more easily limit the export of a symbol to a given list of modules. Eg: EXPORT_SYMBOL_GPL_FOR_MODULES(preempt_notifier_inc, "kvm"); will limit the use of said function to kvm.ko, any other module trying to use this symbol will refure to load (and get modpost build failures). Requested-by: Masahiro Yamada Requested-by: Christoph Hellwig Signed-off-by: Peter Zijlstra Reviewed-by: Petr Pavlu Signed-off-by: Masahiro Yamada --- include/linux/export.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/export.h b/include/linux/export.h index a8c23d945634..f35d03b4113b 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -24,11 +24,17 @@ .long sym #endif -#define ___EXPORT_SYMBOL(sym, license, ns) \ +/* + * LLVM integrated assembler cam merge adjacent string literals (like + * C and GNU-as) passed to '.ascii', but not to '.asciz' and chokes on: + * + * .asciz "MODULE_" "kvm" ; + */ +#define ___EXPORT_SYMBOL(sym, license, ns...) \ .section ".export_symbol","a" ASM_NL \ __export_symbol_##sym: ASM_NL \ .asciz license ASM_NL \ - .asciz ns ASM_NL \ + .ascii ns "\0" ASM_NL \ __EXPORT_SYMBOL_REF(sym) ASM_NL \ .previous @@ -85,4 +91,6 @@ #define EXPORT_SYMBOL_NS(sym, ns) __EXPORT_SYMBOL(sym, "", ns) #define EXPORT_SYMBOL_NS_GPL(sym, ns) __EXPORT_SYMBOL(sym, "GPL", ns) +#define EXPORT_SYMBOL_GPL_FOR_MODULES(sym, mods) __EXPORT_SYMBOL(sym, "GPL", "module:" mods) + #endif /* _LINUX_EXPORT_H */ -- cgit v1.2.3 From 478ad02d6844217cc7568619aeb0809d93ade43d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 25 May 2025 15:43:36 -0700 Subject: Disable FOP_DONTCACHE for now due to bugs This is kind of last-minute, but Al Viro reported that the new FOP_DONTCACHE flag causes memory corruption due to use-after-free issues. This was triggered by commit 974c5e6139db ("xfs: flag as supporting FOP_DONTCACHE"), but that is not the underlying bug - it is just the first user of the flag. Vlastimil Babka suspects the underlying problem stems from the folio_end_writeback() logic introduced in commit fb7d3bc414939 ("mm/filemap: drop streaming/uncached pages when writeback completes"). The most straightforward fix would be to just revert the commit that exposed this, but Matthew Wilcox points out that other filesystems are also starting to enable the FOP_DONTCACHE logic, so this instead disables that bit globally for now. The fix will hopefully end up being trivial and we can just re-enable this logic after more testing, but until such a time we'll have to disable the new FOP_DONTCACHE flag. Reported-by: Al Viro Link: https://lore.kernel.org/all/20250525083209.GS2023217@ZenIV/ Triggered-by: 974c5e6139db ("xfs: flag as supporting FOP_DONTCACHE") Cc: Vlastimil Babka Cc: Matthew Wilcox Cc: Jan Kara Cc: Jens Axboe Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: Christian Brauner Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 016b0fe1536e..a4fd649e2c3f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2186,7 +2186,7 @@ struct file_operations { /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) /* File system supports uncached read/write buffered IO */ -#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) +#define FOP_DONTCACHE 0 /* ((__force fop_flags_t)(1 << 7)) */ /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, -- cgit v1.2.3 From 384492c48e6a88c9a7f0376d8e8ac7f557988e92 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 20 May 2025 13:30:42 -0700 Subject: net: devmem: support single IOV with sendmsg sendmsg() with a single iov becomes ITER_UBUF, sendmsg() with multiple iovs becomes ITER_IOVEC. iter_iov_len does not return correct value for UBUF, so teach to treat UBUF differently. Cc: Al Viro Cc: Pavel Begunkov Cc: Mina Almasry Fixes: bd61848900bf ("net: devmem: Implement TX path") Signed-off-by: Stanislav Fomichev Acked-by: Mina Almasry Reviewed-by: Pavel Begunkov Signed-off-by: David S. Miller --- include/linux/uio.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 49ece9e1888f..393d0622cc28 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -99,7 +99,13 @@ static inline const struct iovec *iter_iov(const struct iov_iter *iter) } #define iter_iov_addr(iter) (iter_iov(iter)->iov_base + (iter)->iov_offset) -#define iter_iov_len(iter) (iter_iov(iter)->iov_len - (iter)->iov_offset) + +static inline size_t iter_iov_len(const struct iov_iter *i) +{ + if (i->iter_type == ITER_UBUF) + return i->count; + return iter_iov(i)->iov_len - i->iov_offset; +} static inline enum iter_type iov_iter_type(const struct iov_iter *i) { -- cgit v1.2.3 From 45ca7e9f0730ae36fc610e675b990e9cc9ca0714 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Wed, 21 May 2025 14:17:05 +0200 Subject: vsock/virtio: fix `rx_bytes` accounting for stream sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In `struct virtio_vsock_sock`, we maintain two counters: - `rx_bytes`: used internally to track how many bytes have been read. This supports mechanisms like .stream_has_data() and sock_rcvlowat(). - `fwd_cnt`: used for the credit mechanism to inform available receive buffer space to the remote peer. These counters are updated via virtio_transport_inc_rx_pkt() and virtio_transport_dec_rx_pkt(). Since the beginning with commit 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko"), we call virtio_transport_dec_rx_pkt() in virtio_transport_stream_do_dequeue() only when we consume the entire packet, so partial reads, do not update `rx_bytes` and `fwd_cnt`. This is fine for `fwd_cnt`, because we still have space used for the entire packet, and we don't want to update the credit for the other peer until we free the space of the entire packet. However, this causes `rx_bytes` to be stale on partial reads. Previously, this didn’t cause issues because `rx_bytes` was used only by .stream_has_data(), and any unread portion of a packet implied data was still available. However, since commit 93b808876682 ("virtio/vsock: fix logic which reduces credit update messages"), we now rely on `rx_bytes` to determine if a credit update should be sent when the data in the RX queue drops below SO_RCVLOWAT value. This patch fixes the accounting by updating `rx_bytes` with the number of bytes actually read, even on partial reads, while leaving `fwd_cnt` untouched until the packet is fully consumed. Also introduce a new `buf_used` counter to check that the remote peer is honoring the given credit; this was previously done via `rx_bytes`. Fixes: 93b808876682 ("virtio/vsock: fix logic which reduces credit update messages") Signed-off-by: Stefano Garzarella Link: https://patch.msgid.link/20250521121705.196379-1-sgarzare@redhat.com Signed-off-by: Paolo Abeni --- include/linux/virtio_vsock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 0387d64e2c66..36fb3edfa403 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -140,6 +140,7 @@ struct virtio_vsock_sock { u32 last_fwd_cnt; u32 rx_bytes; u32 buf_alloc; + u32 buf_used; struct sk_buff_head rx_queue; u32 msg_count; }; -- cgit v1.2.3 From 9be022476fea62b32aa05d9d370e0175155731e6 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 11 Apr 2025 21:14:12 +0800 Subject: mailbox: Remove devm_mbox_controller_unregister Commit e898d9cdd3a9("mailbox: Add device-managed registration functions") introduced device-managed API for mailbox, but in the past 7 years, there is no user for devm_mbox_controller_unregister. So remove it. Signed-off-by: Peng Fan Signed-off-by: Jassi Brar --- include/linux/mailbox_controller.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index 5fb0b65f45a2..ad01c4082358 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -134,7 +134,4 @@ void mbox_chan_txdone(struct mbox_chan *chan, int r); /* atomic */ int devm_mbox_controller_register(struct device *dev, struct mbox_controller *mbox); -void devm_mbox_controller_unregister(struct device *dev, - struct mbox_controller *mbox); - #endif /* __MAILBOX_CONTROLLER_H */ -- cgit v1.2.3 From ed449ddbd867f2cc02d6890c231431f264a876eb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 May 2025 13:46:09 -0700 Subject: net: core: Convert inet_addr_is_any() to sockaddr_storage All the callers of inet_addr_is_any() have a sockaddr_storage-backed sockaddr. Avoid casts and switch prototype to the actual object being used. Reviewed-by: Kuniyuki Iwashima Reviewed-by: Martin K. Petersen # SCSI Signed-off-by: Kees Cook Link: https://patch.msgid.link/20250521204619.2301870-1-kees@kernel.org Signed-off-by: Paolo Abeni --- include/linux/inet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/inet.h b/include/linux/inet.h index bd8276e96e60..9158772f3559 100644 --- a/include/linux/inet.h +++ b/include/linux/inet.h @@ -55,6 +55,6 @@ extern int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char extern int inet_pton_with_scope(struct net *net, unsigned short af, const char *src, const char *port, struct sockaddr_storage *addr); -extern bool inet_addr_is_any(struct sockaddr *addr); +bool inet_addr_is_any(struct sockaddr_storage *addr); #endif /* _LINUX_INET_H */ -- cgit v1.2.3 From 161972650d6795ea00f8b72557cf3c3e593ed250 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 May 2025 13:46:10 -0700 Subject: net: core: Switch netif_set_mac_address() to struct sockaddr_storage In order to avoid passing around struct sockaddr that has a size the compiler cannot reason about (nor track at runtime), convert netif_set_mac_address() to take struct sockaddr_storage. This is just a cast conversion, so there is are no binary changes. Following patches will make actual allocation changes. Acked-by: Gustavo A. R. Silva Signed-off-by: Kees Cook Link: https://patch.msgid.link/20250521204619.2301870-2-kees@kernel.org Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ea9d335de130..47200a394a02 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4212,7 +4212,7 @@ int netif_set_mtu(struct net_device *dev, int new_mtu); int dev_set_mtu(struct net_device *, int); int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack); -int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa, +int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 9ca6804ab7c34f65fcf2e29333a39e7807c30b60 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 May 2025 13:46:14 -0700 Subject: net: core: Convert dev_set_mac_address() to struct sockaddr_storage All users of dev_set_mac_address() are now using a struct sockaddr_storage. Convert the internal data type to struct sockaddr_storage, drop the casts, and update pointer types. Acked-by: Gustavo A. R. Silva Signed-off-by: Kees Cook Link: https://patch.msgid.link/20250521204619.2301870-6-kees@kernel.org Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 47200a394a02..b4242b997373 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4214,7 +4214,7 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack); int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, +int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack); -- cgit v1.2.3 From ae9fcd5a0f8ab7e12619e1c66312a03b842935c3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 May 2025 13:46:16 -0700 Subject: net: core: Convert dev_set_mac_address_user() to use struct sockaddr_storage Convert callers of dev_set_mac_address_user() to use struct sockaddr_storage. Add sanity checks on dev->addr_len usage. Signed-off-by: Kees Cook Acked-by: Gustavo A. R. Silva Link: https://patch.msgid.link/20250521204619.2301870-8-kees@kernel.org Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b4242b997373..adb14db25798 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4216,7 +4216,7 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); -int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, +int dev_set_mac_address_user(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name); int dev_get_port_parent_id(struct net_device *dev, -- cgit v1.2.3 From c5b6ababd21ab6bb251e5a2b4db110e76fb00a26 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 12 May 2025 14:04:02 -0400 Subject: locking/mutex: implement mutex_trylock_nested Despite the fact that several lockdep-related checks are skipped when calling trylock* versions of the locking primitives, for example mutex_trylock, each time the mutex is acquired, a held_lock is still placed onto the lockdep stack by __lock_acquire() which is called regardless of whether the trylock* or regular locking API was used. This means that if the caller successfully acquires more than MAX_LOCK_DEPTH locks of the same class, even when using mutex_trylock, lockdep will still complain that the maximum depth of the held lock stack has been reached and disable itself. For example, the following error currently occurs in the ARM version of KVM, once the code tries to lock all vCPUs of a VM configured with more than MAX_LOCK_DEPTH vCPUs, a situation that can easily happen on modern systems, where having more than 48 CPUs is common, and it's also common to run VMs that have vCPU counts approaching that number: [ 328.171264] BUG: MAX_LOCK_DEPTH too low! [ 328.175227] turning off the locking correctness validator. [ 328.180726] Please attach the output of /proc/lock_stat to the bug report [ 328.187531] depth: 48 max: 48! [ 328.190678] 48 locks held by qemu-kvm/11664: [ 328.194957] #0: ffff800086de5ba0 (&kvm->lock){+.+.}-{3:3}, at: kvm_ioctl_create_device+0x174/0x5b0 [ 328.204048] #1: ffff0800e78800b8 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 [ 328.212521] #2: ffff07ffeee51e98 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 [ 328.220991] #3: ffff0800dc7d80b8 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 [ 328.229463] #4: ffff07ffe0c980b8 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 [ 328.237934] #5: ffff0800a3883c78 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 [ 328.246405] #6: ffff07fffbe480b8 (&vcpu->mutex){+.+.}-{3:3}, at: lock_all_vcpus+0x16c/0x2a0 Luckily, in all instances that require locking all vCPUs, the 'kvm->lock' is taken a priori, and that fact makes it possible to use the little known feature of lockdep, called a 'nest_lock', to avoid this warning and subsequent lockdep self-disablement. The action of 'nested lock' being provided to lockdep's lock_acquire(), causes the lockdep to detect that the top of the held lock stack contains a lock of the same class and then increment its reference counter instead of pushing a new held_lock item onto that stack. See __lock_acquire for more information. Signed-off-by: Maxim Levitsky Acked-by: Peter Zijlstra (Intel) Message-ID: <20250512180407.659015-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/mutex.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 2143d05116be..da4518cfd59c 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -193,7 +193,22 @@ extern void mutex_lock_io(struct mutex *lock); * * Returns 1 if the mutex has been acquired successfully, and 0 on contention. */ + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern int _mutex_trylock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); + +#define mutex_trylock_nest_lock(lock, nest_lock) \ +( \ + typecheck(struct lockdep_map *, &(nest_lock)->dep_map), \ + _mutex_trylock_nest_lock(lock, &(nest_lock)->dep_map) \ +) + +#define mutex_trylock(lock) _mutex_trylock_nest_lock(lock, NULL) +#else extern int mutex_trylock(struct mutex *lock); +#define mutex_trylock_nest_lock(lock, nest_lock) mutex_trylock(lock) +#endif + extern void mutex_unlock(struct mutex *lock); extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); -- cgit v1.2.3 From fb49f07ba1d9d8c9bd8854878ae6b5b21ff9ac45 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 12 May 2025 14:04:03 -0400 Subject: locking/mutex: implement mutex_lock_killable_nest_lock KVM's SEV intra-host migration code needs to lock all vCPUs of the source and the target VM, before it proceeds with the migration. The number of vCPUs that belong to each VM is not bounded by anything except a self-imposed KVM limit of CONFIG_KVM_MAX_NR_VCPUS vCPUs which is significantly larger than the depth of lockdep's lock stack. Luckily, the locks in both of the cases mentioned above, are held under the 'kvm->lock' of each VM, which means that we can use the little known lockdep feature called a "nest_lock" to support this use case in a cleaner way, compared to the way it's currently done. Implement and expose 'mutex_lock_killable_nest_lock' for this purpose. Signed-off-by: Maxim Levitsky Acked-by: Peter Zijlstra (Intel) Message-ID: <20250512180407.659015-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/mutex.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index da4518cfd59c..a039fa8c1780 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -156,16 +156,15 @@ static inline int __devm_mutex_init(struct device *dev, struct mutex *lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass); extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); - extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass); -extern int __must_check mutex_lock_killable_nested(struct mutex *lock, - unsigned int subclass); +extern int __must_check _mutex_lock_killable(struct mutex *lock, + unsigned int subclass, struct lockdep_map *nest_lock); extern void mutex_lock_io_nested(struct mutex *lock, unsigned int subclass); #define mutex_lock(lock) mutex_lock_nested(lock, 0) #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0) -#define mutex_lock_killable(lock) mutex_lock_killable_nested(lock, 0) +#define mutex_lock_killable(lock) _mutex_lock_killable(lock, 0, NULL) #define mutex_lock_io(lock) mutex_lock_io_nested(lock, 0) #define mutex_lock_nest_lock(lock, nest_lock) \ @@ -174,6 +173,15 @@ do { \ _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ } while (0) +#define mutex_lock_killable_nest_lock(lock, nest_lock) \ +( \ + typecheck(struct lockdep_map *, &(nest_lock)->dep_map), \ + _mutex_lock_killable(lock, 0, &(nest_lock)->dep_map) \ +) + +#define mutex_lock_killable_nested(lock, subclass) \ + _mutex_lock_killable(lock, subclass, NULL) + #else extern void mutex_lock(struct mutex *lock); extern int __must_check mutex_lock_interruptible(struct mutex *lock); @@ -183,6 +191,7 @@ extern void mutex_lock_io(struct mutex *lock); # define mutex_lock_nested(lock, subclass) mutex_lock(lock) # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock) +# define mutex_lock_killable_nest_lock(lock, nest_lock) mutex_lock_killable(lock) # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) # define mutex_lock_io_nested(lock, subclass) mutex_lock_io(lock) #endif -- cgit v1.2.3 From e4a454ced74c0ac97c8bd32f086ee3ad74528780 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 12 May 2025 14:04:04 -0400 Subject: KVM: add kvm_lock_all_vcpus and kvm_trylock_all_vcpus In a few cases, usually in the initialization code, KVM locks all vCPUs of a VM to ensure that userspace doesn't do funny things while KVM performs an operation that affects the whole VM. Until now, all these operations were implemented using custom code, and all of them share the same problem: Lockdep can't cope with simultaneous locking of a large number of locks of the same class. However if these locks are taken while another lock is already held, which is luckily the case, it is possible to take advantage of little known _nest_lock feature of lockdep which allows in this case to have an unlimited number of locks of same class to be taken. To implement this, create two functions: kvm_lock_all_vcpus() and kvm_trylock_all_vcpus() Both functions are needed because some code that will be replaced in the subsequent patches, uses mutex_trylock, instead of regular mutex_lock. Suggested-by: Paolo Bonzini Signed-off-by: Maxim Levitsky Acked-by: Marc Zyngier Acked-by: Peter Zijlstra (Intel) Message-ID: <20250512180407.659015-4-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1dedc421b3e3..a6140415c693 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1015,6 +1015,10 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) void kvm_destroy_vcpus(struct kvm *kvm); +int kvm_trylock_all_vcpus(struct kvm *kvm); +int kvm_lock_all_vcpus(struct kvm *kvm); +void kvm_unlock_all_vcpus(struct kvm *kvm); + void vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 89f9dba365e1b85caef556d28654c6d336629eef Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Thu, 22 May 2025 23:04:25 +0000 Subject: dma-buf: Rename debugfs symbols MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the debugfs list and mutex so it's clear they are now usable without the need for CONFIG_DEBUG_FS. The list will always be populated to support the creation of a BPF iterator for dmabufs. Signed-off-by: T.J. Mercier Reviewed-by: Christian König Acked-by: Song Liu Link: https://lore.kernel.org/r/20250522230429.941193-2-tjmercier@google.com Signed-off-by: Alexei Starovoitov --- include/linux/dma-buf.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 36216d28d8bd..8ff4add71f88 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -370,10 +370,8 @@ struct dma_buf { */ struct module *owner; -#if IS_ENABLED(CONFIG_DEBUG_FS) /** @list_node: node for dma_buf accounting and debugging. */ struct list_head list_node; -#endif /** @priv: exporter specific private data for this buffer object. */ void *priv; -- cgit v1.2.3 From 76ea95534995adde1aa3cb1aa97ef33f50a617a9 Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Thu, 22 May 2025 23:04:26 +0000 Subject: bpf: Add dmabuf iterator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dmabuf iterator traverses the list of all DMA buffers. DMA buffers are refcounted through their associated struct file. A reference is taken on each buffer as the list is iterated to ensure each buffer persists for the duration of the bpf program execution without holding the list mutex. Signed-off-by: T.J. Mercier Reviewed-by: Christian König Acked-by: Song Liu Link: https://lore.kernel.org/r/20250522230429.941193-3-tjmercier@google.com Signed-off-by: Alexei Starovoitov --- include/linux/dma-buf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 8ff4add71f88..7af2ea839f58 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -634,4 +634,6 @@ int dma_buf_vmap(struct dma_buf *dmabuf, struct iosys_map *map); void dma_buf_vunmap(struct dma_buf *dmabuf, struct iosys_map *map); int dma_buf_vmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map); void dma_buf_vunmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map); +struct dma_buf *dma_buf_iter_begin(void); +struct dma_buf *dma_buf_iter_next(struct dma_buf *dmbuf); #endif /* __DMA_BUF_H__ */ -- cgit v1.2.3 From 7b2b67dbd449afd00fc7279b1ab7ffa3d26fe0c9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 27 May 2025 07:28:54 -0600 Subject: Revert "Disable FOP_DONTCACHE for now due to bugs" This reverts commit 478ad02d6844217cc7568619aeb0809d93ade43d. Both the read and write side dirty && writeback races should be resolved now, revert the commit that disabled FOP_DONTCACHE for filesystems. Link: https://lore.kernel.org/linux-fsdevel/20250525083209.GS2023217@ZenIV/ Signed-off-by: Jens Axboe Link: https://lore.kernel.org/20250527133255.452431-4-axboe@kernel.dk Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0db87f8e676c..57c3db3ef6ad 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2207,7 +2207,7 @@ struct file_operations { /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) /* File system supports uncached read/write buffered IO */ -#define FOP_DONTCACHE 0 /* ((__force fop_flags_t)(1 << 7)) */ +#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, -- cgit v1.2.3 From e2d2115e56c4a02377189bfc3a9a7933552a7b0f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 23 May 2025 21:13:35 -0700 Subject: bpf: Do not include stack ptr register in precision backtracking bookkeeping Yi Lai reported an issue ([1]) where the following warning appears in kernel dmesg: [ 60.643604] verifier backtracking bug [ 60.643635] WARNING: CPU: 10 PID: 2315 at kernel/bpf/verifier.c:4302 __mark_chain_precision+0x3a6c/0x3e10 [ 60.648428] Modules linked in: bpf_testmod(OE) [ 60.650471] CPU: 10 UID: 0 PID: 2315 Comm: test_progs Tainted: G OE 6.15.0-rc4-gef11287f8289-dirty #327 PREEMPT(full) [ 60.654385] Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE [ 60.656682] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 60.660475] RIP: 0010:__mark_chain_precision+0x3a6c/0x3e10 [ 60.662814] Code: 5a 30 84 89 ea e8 c4 d9 01 00 80 3d 3e 7d d8 04 00 0f 85 60 fa ff ff c6 05 31 7d d8 04 01 48 c7 c7 00 58 30 84 e8 c4 06 a5 ff <0f> 0b e9 46 fa ff ff 48 ... [ 60.668720] RSP: 0018:ffff888116cc7298 EFLAGS: 00010246 [ 60.671075] RAX: 54d70e82dfd31900 RBX: ffff888115b65e20 RCX: 0000000000000000 [ 60.673659] RDX: 0000000000000001 RSI: 0000000000000004 RDI: 00000000ffffffff [ 60.676241] RBP: 0000000000000400 R08: ffff8881f6f23bd3 R09: 1ffff1103ede477a [ 60.678787] R10: dffffc0000000000 R11: ffffed103ede477b R12: ffff888115b60ae8 [ 60.681420] R13: 1ffff11022b6cbc4 R14: 00000000fffffff2 R15: 0000000000000001 [ 60.684030] FS: 00007fc2aedd80c0(0000) GS:ffff88826fa8a000(0000) knlGS:0000000000000000 [ 60.686837] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 60.689027] CR2: 000056325369e000 CR3: 000000011088b002 CR4: 0000000000370ef0 [ 60.691623] Call Trace: [ 60.692821] [ 60.693960] ? __pfx_verbose+0x10/0x10 [ 60.695656] ? __pfx_disasm_kfunc_name+0x10/0x10 [ 60.697495] check_cond_jmp_op+0x16f7/0x39b0 [ 60.699237] do_check+0x58fa/0xab10 ... Further analysis shows the warning is at line 4302 as below: 4294 /* static subprog call instruction, which 4295 * means that we are exiting current subprog, 4296 * so only r1-r5 could be still requested as 4297 * precise, r0 and r6-r10 or any stack slot in 4298 * the current frame should be zero by now 4299 */ 4300 if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { 4301 verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); 4302 WARN_ONCE(1, "verifier backtracking bug"); 4303 return -EFAULT; 4304 } With the below test (also in the next patch): __used __naked static void __bpf_jmp_r10(void) { asm volatile ( "r2 = 2314885393468386424 ll;" "goto +0;" "if r2 <= r10 goto +3;" "if r1 >= -1835016 goto +0;" "if r2 <= 8 goto +0;" "if r3 <= 0 goto +0;" "exit;" ::: __clobber_all); } SEC("?raw_tp") __naked void bpf_jmp_r10(void) { asm volatile ( "r3 = 0 ll;" "call __bpf_jmp_r10;" "r0 = 0;" "exit;" ::: __clobber_all); } The following is the verifier failure log: 0: (18) r3 = 0x0 ; R3_w=0 2: (85) call pc+2 caller: R10=fp0 callee: frame1: R1=ctx() R3_w=0 R10=fp0 5: frame1: R1=ctx() R3_w=0 R10=fp0 ; asm volatile (" \ @ verifier_precision.c:184 5: (18) r2 = 0x20202000256c6c78 ; frame1: R2_w=0x20202000256c6c78 7: (05) goto pc+0 8: (bd) if r2 <= r10 goto pc+3 ; frame1: R2_w=0x20202000256c6c78 R10=fp0 9: (35) if r1 >= 0xffe3fff8 goto pc+0 ; frame1: R1=ctx() 10: (b5) if r2 <= 0x8 goto pc+0 mark_precise: frame1: last_idx 10 first_idx 0 subseq_idx -1 mark_precise: frame1: regs=r2 stack= before 9: (35) if r1 >= 0xffe3fff8 goto pc+0 mark_precise: frame1: regs=r2 stack= before 8: (bd) if r2 <= r10 goto pc+3 mark_precise: frame1: regs=r2,r10 stack= before 7: (05) goto pc+0 mark_precise: frame1: regs=r2,r10 stack= before 5: (18) r2 = 0x20202000256c6c78 mark_precise: frame1: regs=r10 stack= before 2: (85) call pc+2 BUG regs 400 The main failure reason is due to r10 in precision backtracking bookkeeping. Actually r10 is always precise and there is no need to add it for the precision backtracking bookkeeping. One way to fix the issue is to prevent bt_set_reg() if any src/dst reg is r10. Andrii suggested to go with push_insn_history() approach to avoid explicitly checking r10 in backtrack_insn(). This patch added push_insn_history() support for cond_jmp like 'rX rY' operations. In check_cond_jmp_op(), if any of rX or rY is a stack pointer, push_insn_history() will record such information, and later backtrack_insn() will do bt_set_reg() properly for those register(s). [1] https://lore.kernel.org/bpf/Z%2F8q3xzpU59CIYQE@ly-workstation/ Reported by: Yi Lai Fixes: 407958a0e980 ("bpf: encapsulate precision backtracking bookkeeping") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250524041335.4046126-1-yonghong.song@linux.dev --- include/linux/bpf_verifier.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 78c97e12ea4e..256274acb1d8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -356,7 +356,11 @@ enum { INSN_F_SPI_MASK = 0x3f, /* 6 bits */ INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */ - INSN_F_STACK_ACCESS = BIT(9), /* we need 10 bits total */ + INSN_F_STACK_ACCESS = BIT(9), + + INSN_F_DST_REG_STACK = BIT(10), /* dst_reg is PTR_TO_STACK */ + INSN_F_SRC_REG_STACK = BIT(11), /* src_reg is PTR_TO_STACK */ + /* total 12 bits are used now. */ }; static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES); @@ -365,9 +369,9 @@ static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8); struct bpf_insn_hist_entry { u32 idx; /* insn idx can't be bigger than 1 million */ - u32 prev_idx : 22; - /* special flags, e.g., whether insn is doing register stack spill/load */ - u32 flags : 10; + u32 prev_idx : 20; + /* special INSN_F_xxx flags */ + u32 flags : 12; /* additional registers that need precision tracking when this * jump is backtracked, vector of six 10-bit records */ -- cgit v1.2.3 From e9cb929670a1e98b592b30f03f06e9e20110f318 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 22 May 2025 13:21:47 +0200 Subject: net: phy: fix up const issues in to_mdio_device() and to_phy_device() Both to_mdio_device() and to_phy_device() "throw away" the const pointer attribute passed to them and return a non-const pointer, which generally is not a good thing overall. Fix this up by using container_of_const() which was designed for this very problem. Cc: Alexander Lobakin Cc: Andrew Lunn Cc: Heiner Kallweit Cc: Russell King Fixes: 7eab14de73a8 ("mdio, phy: fix -Wshadow warnings triggered by nested container_of()") Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2025052246-conduit-glory-8fc9@gregkh Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 5 +---- include/linux/phy.h | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 3c3deac57894..e43ff9f980a4 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -45,10 +45,7 @@ struct mdio_device { unsigned int reset_deassert_delay; }; -static inline struct mdio_device *to_mdio_device(const struct device *dev) -{ - return container_of(dev, struct mdio_device, dev); -} +#define to_mdio_device(__dev) container_of_const(__dev, struct mdio_device, dev) /* struct mdio_driver_common: Common to all MDIO drivers */ struct mdio_driver_common { diff --git a/include/linux/phy.h b/include/linux/phy.h index 32b9da274115..e194dad1623d 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -744,10 +744,7 @@ struct phy_device { #define PHY_F_NO_IRQ 0x80000000 #define PHY_F_RXC_ALWAYS_ON 0x40000000 -static inline struct phy_device *to_phy_device(const struct device *dev) -{ - return container_of(to_mdio_device(dev), struct phy_device, mdio); -} +#define to_phy_device(__dev) container_of_const(to_mdio_device(__dev), struct phy_device, mdio) /** * struct phy_tdr_config - Configuration of a TDR raw test -- cgit v1.2.3 From 81edb1ba3232afd45ae7f3f492a91019571b18c9 Mon Sep 17 00:00:00 2001 From: Fan Ni Date: Mon, 5 May 2025 11:22:42 -0700 Subject: mm/hugetlb: refactor unmap_hugepage_range() to take folio instead of page The function unmap_hugepage_range() has two kinds of users: 1) unmap_ref_private(), which passes in the head page of a folio. Since unmap_ref_private() already takes folio and there are no other uses of the folio struct in the function, it is natural for unmap_hugepage_range() to take folio also. 2) All other uses, which pass in NULL pointer. In both cases, we can pass in folio. Refactor unmap_hugepage_range() to take folio. Link: https://lkml.kernel.org/r/20250505182345.506888-4-nifan.cxl@gmail.com Signed-off-by: Fan Ni Reviewed-by: Muchun Song Reviewed-by: Sidhartha Kumar Reviewed-by: Oscar Salvador Acked-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: "Vishal Moola (Oracle)" Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 23ebf49c5d6a..f6d5f24e793c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -129,8 +129,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); void unmap_hugepage_range(struct vm_area_struct *, - unsigned long, unsigned long, struct page *, - zap_flags_t); + unsigned long start, unsigned long end, + struct folio *, zap_flags_t); void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, -- cgit v1.2.3 From 7f4b6065d9a842721a04632fc219aa453d1b2f5c Mon Sep 17 00:00:00 2001 From: Fan Ni Date: Mon, 5 May 2025 11:22:43 -0700 Subject: mm/hugetlb: refactor __unmap_hugepage_range() to take folio instead of page The function __unmap_hugepage_range() has two kinds of users: 1) unmap_hugepage_range(), which passes in the head page of a folio. Since unmap_hugepage_range() already takes folio and there are no other uses of the folio struct in the function, it is natural for __unmap_hugepage_range() to take folio also. 2) All other uses, which pass in NULL pointer. In both cases, we can pass in folio. Refactor __unmap_hugepage_range() to take folio. Link: https://lkml.kernel.org/r/20250505182345.506888-5-nifan.cxl@gmail.com Signed-off-by: Fan Ni Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Sidhartha Kumar Cc: "Vishal Moola (Oracle)" Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f6d5f24e793c..eb21619206af 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -134,7 +134,7 @@ void unmap_hugepage_range(struct vm_area_struct *, void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, - struct page *ref_page, zap_flags_t zap_flags); + struct folio *, zap_flags_t zap_flags); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(char *buf, int len, int nid); void hugetlb_show_meminfo_node(int nid); @@ -455,7 +455,7 @@ static inline long hugetlb_change_protection( static inline void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page, + unsigned long end, struct folio *folio, zap_flags_t zap_flags) { BUG(); -- cgit v1.2.3 From 375700bab5b150e876e42d894a9a7470881f8a61 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 May 2025 13:13:11 -0600 Subject: llist: make llist_add_batch() a static inline The function is small enough that it should be, and it's a (very) hot path for io_uring. Doing this actually reduces my vmlinux text size for my standard build/test box. Before: axboe@r7625 ~/g/linux (test)> size vmlinux text data bss dec hex filename 19892174 5938310 2470432 28300916 1afd674 vmlinux After: axboe@r7625 ~/g/linux (test)> size vmlinux text data bss dec hex filename 19891878 5938310 2470436 28300624 1afd550 vmlinux Link: https://lkml.kernel.org/r/f1d104c6-7ac8-457a-a53d-6bb741421b2f@kernel.dk Signed-off-by: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/llist.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/llist.h b/include/linux/llist.h index 2c982ff7475a..27b17f64bcee 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -223,9 +223,26 @@ static inline struct llist_node *llist_next(struct llist_node *node) return node->next; } -extern bool llist_add_batch(struct llist_node *new_first, - struct llist_node *new_last, - struct llist_head *head); +/** + * llist_add_batch - add several linked entries in batch + * @new_first: first entry in batch to be added + * @new_last: last entry in batch to be added + * @head: the head for your lock-less list + * + * Return whether list is empty before adding. + */ +static inline bool llist_add_batch(struct llist_node *new_first, + struct llist_node *new_last, + struct llist_head *head) +{ + struct llist_node *first = READ_ONCE(head->first); + + do { + new_last->next = first; + } while (!try_cmpxchg(&head->first, &first, new_first)); + + return !first; +} static inline bool __llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, -- cgit v1.2.3 From 206cc44588f72b49ad4d7e21a7472ab2a72a83df Mon Sep 17 00:00:00 2001 From: Sami Uddin Date: Mon, 12 May 2025 07:51:53 +0930 Subject: virtio: reject shm region if length is zero Prevent usage of shared memory regions where the length is zero, as such configurations are not valid and may lead to unexpected behavior. Signed-off-by: Sami Uddin Message-Id: <20250511222153.2332-1-sami.md.ko@gmail.com> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 169c7d367fac..b3e1d30c765b 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -329,6 +329,8 @@ static inline bool virtio_get_shm_region(struct virtio_device *vdev, struct virtio_shm_region *region, u8 id) { + if (!region->len) + return false; if (!vdev->config->get_shm_region) return false; return vdev->config->get_shm_region(vdev, region, id); -- cgit v1.2.3 From 34ecde3c56066ba79e5ec3d93c5b14ea83e3603e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 27 May 2025 17:01:31 -0600 Subject: iomap: don't lose folio dropbehind state for overwrites DONTCACHE I/O must have the completion punted to a workqueue, just like what is done for unwritten extents, as the completion needs task context to perform the invalidation of the folio(s). However, if writeback is started off filemap_fdatawrite_range() off generic_sync() and it's an overwrite, then the DONTCACHE marking gets lost as iomap_add_to_ioend() don't look at the folio being added and no further state is passed down to help it know that this is a dropbehind/DONTCACHE write. Check if the folio being added is marked as dropbehind, and set IOMAP_IOEND_DONTCACHE if that is the case. Then XFS can factor this into the decision making of completion context in xfs_submit_ioend(). Additionally include this ioend flag in the NOMERGE flags, to avoid mixing it with unrelated IO. Since this is the 3rd flag that will cause XFS to punt the completion to a workqueue, add a helper so that each one of them can get appropriately commented. This fixes extra page cache being instantiated when the write performed is an overwrite, rather than newly instantiated blocks. Fixes: b2cd5ae693a3 ("iomap: make buffered writes work with RWF_DONTCACHE") Signed-off-by: Jens Axboe Link: https://lore.kernel.org/5153f6e8-274d-4546-bf55-30a5018e0d03@kernel.dk Reviewed-by: Dave Chinner Signed-off-by: Christian Brauner --- include/linux/iomap.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 68416b135151..522644d62f30 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -377,13 +377,16 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, #define IOMAP_IOEND_BOUNDARY (1U << 2) /* is direct I/O */ #define IOMAP_IOEND_DIRECT (1U << 3) +/* is DONTCACHE I/O */ +#define IOMAP_IOEND_DONTCACHE (1U << 4) /* * Flags that if set on either ioend prevent the merge of two ioends. * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) */ #define IOMAP_IOEND_NOMERGE_FLAGS \ - (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT) + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT | \ + IOMAP_IOEND_DONTCACHE) /* * Structure for writeback I/O completions. -- cgit v1.2.3 From 793908d60b8745c386b9f4e29eb702f74ceb0886 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 24 Apr 2025 10:34:04 +0200 Subject: PCI: endpoint: Retain fixed-size BAR size as well as aligned size When allocating space for an endpoint function on a BAR with a fixed size, the size saved in 'struct pci_epf_bar.size' should be the fixed size as expected by pci_epc_set_bar(). However, if pci_epf_alloc_space() increased the allocation size to accommodate iATU alignment requirements, it previously saved the larger aligned size in .size, which broke pci_epc_set_bar(). To solve this, keep the fixed BAR size in .size and save the aligned size in a new .aligned_size for use when deallocating it. Fixes: 2a9a801620ef ("PCI: endpoint: Add support to specify alignment for buffers allocated to BARs") Signed-off-by: Jerome Brunet [mani: commit message fixup] Signed-off-by: Manivannan Sadhasivam [bhelgaas: more specific subject, commit log, wrap comment to match file] Signed-off-by: Bjorn Helgaas Reviewed-by: Niklas Cassel Link: https://patch.msgid.link/20250424-pci-ep-size-alignment-v5-1-2d4ec2af23f5@baylibre.com --- include/linux/pci-epf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 879d19cebd4f..749cee0bcf2c 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -114,6 +114,8 @@ struct pci_epf_driver { * @phys_addr: physical address that should be mapped to the BAR * @addr: virtual address corresponding to the @phys_addr * @size: the size of the address space present in BAR + * @aligned_size: the size actually allocated to accommodate the iATU alignment + * requirement * @barno: BAR number * @flags: flags that are set for the BAR */ @@ -121,6 +123,7 @@ struct pci_epf_bar { dma_addr_t phys_addr; void *addr; size_t size; + size_t aligned_size; enum pci_barno barno; int flags; }; -- cgit v1.2.3 From d2e1d783f2c619cf9d8242b67a0ab0d2915f2920 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 11 Apr 2025 15:47:32 -0400 Subject: NFS: Add support for fallocate(FALLOC_FL_ZERO_RANGE) This implements a suggestion from Trond that we can mimic FALLOC_FL_ZERO_RANGE by sending a compound that first does a DEALLOCATE to punch a hole in a file, and then an ALLOCATE to fill the hole with zeroes. There might technically be a race here, but once the DEALLOCATE finishes any reads from the region would return zeroes anyway, so I don't expect it to cause problems. Signed-off-by: Anna Schumaker --- include/linux/nfs4.h | 1 + include/linux/nfs_fs_sb.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index d8cad844870a..07635f87a3fd 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -678,6 +678,7 @@ enum { NFSPROC4_CLNT_SEEK, NFSPROC4_CLNT_ALLOCATE, NFSPROC4_CLNT_DEALLOCATE, + NFSPROC4_CLNT_ZERO_RANGE, NFSPROC4_CLNT_LAYOUTSTATS, NFSPROC4_CLNT_CLONE, NFSPROC4_CLNT_COPY, diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index e02f4c4e9cc4..63141320c2a8 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -304,6 +304,7 @@ struct nfs_server { #define NFS_CAP_CASE_PRESERVING (1U << 7) #define NFS_CAP_REBOOT_LAYOUTRETURN (1U << 8) #define NFS_CAP_OFFLOAD_STATUS (1U << 9) +#define NFS_CAP_ZERO_RANGE (1U << 10) #define NFS_CAP_OPEN_XOR (1U << 12) #define NFS_CAP_DELEGTIME (1U << 13) #define NFS_CAP_POSIX_LOCK (1U << 14) -- cgit v1.2.3 From 77be29b7a3f896bd96808ba6781481a1f6afa66c Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Thu, 1 May 2025 08:29:42 -0400 Subject: NFSv4: Allow FREE_STATEID to clean up delegations The NFS client's list of delegations can grow quite large (well beyond the delegation watermark) if the server is revoking or there are repeated events that expire state. Once this happens, the revoked delegations can cause a performance problem for subsequent walks of the servers->delegations list when the client tries to test and free state. If we can determine that the FREE_STATEID operation has completed without error, we can prune the delegation from the list. Since the NFS client combines TEST_STATEID with FREE_STATEID in its minor version operations, there isn't an easy way to communicate success of FREE_STATEID. Rather than re-arrange quite a number of calling paths to break out the separate procedures, let's signal the success of FREE_STATEID by setting the stateid's type. Set NFS4_FREED_STATEID_TYPE for stateids that have been successfully discarded from the server, and use that type to signal that the delegation can be cleaned up. Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 07635f87a3fd..e947af6a3684 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -72,6 +72,7 @@ struct nfs4_stateid_struct { NFS4_LAYOUT_STATEID_TYPE, NFS4_PNFS_DS_STATEID_TYPE, NFS4_REVOKED_STATEID_TYPE, + NFS4_FREED_STATEID_TYPE, } type; }; -- cgit v1.2.3 From 77e82fb2c6c27c122e785f543ae0062f7783c886 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 May 2025 10:46:39 +1000 Subject: nfs_localio: always hold nfsd net ref with nfsd_file ref Having separate nfsd_file_put and nfsd_file_put_local in struct nfsd_localio_operations doesn't make much sense. The difference is that nfsd_file_put doesn't drop a reference to the nfs_net which is what keeps nfsd from shutting down. Currently, if nfsd tries to shutdown it will invalidate the files stored in the list from the nfs_uuid and this will drop all references to the nfsd net that the client holds. But the client could still hold some references to nfsd_files for active IO. So nfsd might think is has completely shut down local IO, but hasn't and has no way to wait for those active IO requests to complete. So this patch changes nfsd_file_get to nfsd_file_get_local and has it increase the ref count on the nfsd net and it replaces all calls to ->nfsd_put_file to ->nfsd_put_file_local. It also changes ->nfsd_open_local_fh to return with the refcount on the net elevated precisely when a valid nfsd_file is returned. This means that whenever the client holds a valid nfsd_file, there will be an associated count on the nfsd net, and so the count can only reach zero when all nfsd_files have been returned. nfs_local_file_put() is changed to call nfs_to_nfsd_file_put_local() instead of replacing calls to one with calls to the other because this will help a later patch which changes nfs_to_nfsd_file_put_local() to take an __rcu pointer while nfs_local_file_put() doesn't. Fixes: 86e00412254a ("nfs: cache all open LOCALIO nfsd_file(s) in client") Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- include/linux/nfslocalio.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 9aa8a43843d7..c3f34bae60e1 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -66,8 +66,7 @@ struct nfsd_localio_operations { const struct nfs_fh *, const fmode_t); struct net *(*nfsd_file_put_local)(struct nfsd_file *); - struct nfsd_file *(*nfsd_file_get)(struct nfsd_file *); - void (*nfsd_file_put)(struct nfsd_file *); + struct nfsd_file *(*nfsd_file_get_local)(struct nfsd_file *); struct file *(*nfsd_file_file)(struct nfsd_file *); } ____cacheline_aligned; -- cgit v1.2.3 From c25a89770d1f216dcedfc2d25d56b604f62ce0bd Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 May 2025 10:46:43 +1000 Subject: nfs_localio: change nfsd_file_put_local() to take a pointer to __rcu pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of calling xchg() and unrcu_pointer() before nfsd_file_put_local(), we now pass pointer to the __rcu pointer and call xchg() and unrcu_pointer() inside that function. Where unrcu_pointer() is currently called the internals of "struct nfsd_file" are not known and that causes older compilers such as gcc-8 to complain. In some cases we have a __kernel (aka normal) pointer not an __rcu pointer so we need to cast it to __rcu first. This is strictly a weakening so no information is lost. Somewhat surprisingly, this cast is accepted by gcc-8. This has the pleasing result that the cmpxchg() which sets ro_file and rw_file, and also the xchg() which clears them, are both now in the nfsd code. Reported-by: Pali Rohár Reported-by: Vincent Mailhol Fixes: 86e00412254a ("nfs: cache all open LOCALIO nfsd_file(s) in client") Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- include/linux/nfslocalio.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index c3f34bae60e1..5c7c92659e73 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -50,10 +50,6 @@ void nfs_localio_invalidate_clients(struct list_head *nn_local_clients, spinlock_t *nn_local_clients_lock); /* localio needs to map filehandle -> struct nfsd_file */ -extern struct nfsd_file * -nfsd_open_local_fh(struct net *, struct auth_domain *, struct rpc_clnt *, - const struct cred *, const struct nfs_fh *, - const fmode_t) __must_hold(rcu); void nfs_close_local_fh(struct nfs_file_localio *); struct nfsd_localio_operations { @@ -64,8 +60,9 @@ struct nfsd_localio_operations { struct rpc_clnt *, const struct cred *, const struct nfs_fh *, + struct nfsd_file __rcu **pnf, const fmode_t); - struct net *(*nfsd_file_put_local)(struct nfsd_file *); + struct net *(*nfsd_file_put_local)(struct nfsd_file __rcu **); struct nfsd_file *(*nfsd_file_get_local)(struct nfsd_file *); struct file *(*nfsd_file_file)(struct nfsd_file *); } ____cacheline_aligned; @@ -76,6 +73,7 @@ extern const struct nfsd_localio_operations *nfs_to; struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *, struct rpc_clnt *, const struct cred *, const struct nfs_fh *, struct nfs_file_localio *, + struct nfsd_file __rcu **pnf, const fmode_t); static inline void nfs_to_nfsd_net_put(struct net *net) @@ -90,16 +88,19 @@ static inline void nfs_to_nfsd_net_put(struct net *net) rcu_read_unlock(); } -static inline void nfs_to_nfsd_file_put_local(struct nfsd_file *localio) +static inline void nfs_to_nfsd_file_put_local(struct nfsd_file __rcu **localio) { /* - * Must not hold RCU otherwise nfsd_file_put() can easily trigger: - * "Voluntary context switch within RCU read-side critical section!" - * by scheduling deep in underlying filesystem (e.g. XFS). + * Either *localio must be guaranteed to be non-NULL, or caller + * must prevent nfsd shutdown from completing as nfs_close_local_fh() + * does by blocking the nfs_uuid from being finally put. */ - struct net *net = nfs_to->nfsd_file_put_local(localio); + struct net *net; - nfs_to_nfsd_net_put(net); + net = nfs_to->nfsd_file_put_local(localio); + + if (net) + nfs_to_nfsd_net_put(net); } #else /* CONFIG_NFS_LOCALIO */ -- cgit v1.2.3 From f62da6e7270c2db5aef8a8b14f465896961a9372 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 14 May 2025 09:43:18 +0200 Subject: PCI: endpoint: Align pci_epc_set_msi(), pci_epc_ops::set_msi() nr_irqs encoding The kdoc for pci_epc_set_msi() says: "Invoke to set the required number of MSI interrupts." The kdoc for the callback pci_epc_ops::set_msi() says: "ops to set the requested number of MSI interrupts in the MSI capability register" pci_epc_ops::set_msi() does however expect the parameter 'interrupts' to be in the encoding as defined by the Multiple Message Capable (MMC) field of the MSI capability structure. Nowhere in the kdoc does it say that the number of interrupts should be in MMC encoding. It is very confusing that the API pci_epc_set_msi() and the callback function pci_epc_ops::set_msi() both take a parameter named 'interrupts', but they expect completely different encodings. Clean up the API and the callback function to have the same semantics, i.e. the parameter represents the number of interrupts, regardless of the internal encoding of that value. Also rename the parameter 'interrupts' to 'nr_irqs', in both the wrapper function and the callback function, such that the name is unambiguous. [bhelgaas: more specific subject] Signed-off-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Signed-off-by: Bjorn Helgaas Cc: stable+noautosel@kernel.org # this is simply a cleanup Link: https://patch.msgid.link/20250514074313.283156-13-cassel@kernel.org --- include/linux/pci-epc.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 82837008b56f..15d10c07c9f1 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -100,7 +100,7 @@ struct pci_epc_ops { void (*unmap_addr)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t addr); int (*set_msi)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, - u8 interrupts); + u8 nr_irqs); int (*get_msi)(struct pci_epc *epc, u8 func_no, u8 vfunc_no); int (*set_msix)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, u16 interrupts, enum pci_barno, u32 offset); @@ -286,8 +286,7 @@ int pci_epc_map_addr(struct pci_epc *epc, u8 func_no, u8 vfunc_no, u64 pci_addr, size_t size); void pci_epc_unmap_addr(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr); -int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 vfunc_no, - u8 interrupts); +int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 vfunc_no, u8 nr_irqs); int pci_epc_get_msi(struct pci_epc *epc, u8 func_no, u8 vfunc_no); int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u8 vfunc_no, u16 interrupts, enum pci_barno, u32 offset); -- cgit v1.2.3 From de0321bcc5fdd83631f0c2a6fdebfe0ad4e23449 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 14 May 2025 09:43:19 +0200 Subject: PCI: endpoint: Align pci_epc_set_msix(), pci_epc_ops::set_msix() nr_irqs encoding The kdoc for pci_epc_set_msix() says: "Invoke to set the required number of MSI-X interrupts." The kdoc for the callback pci_epc_ops->set_msix() says: "ops to set the requested number of MSI-X interrupts in the MSI-X capability register" pci_epc_ops::set_msix() does however expect the parameter 'interrupts' to be in the encoding as defined by the Table Size field. Nowhere in the kdoc does it say that the number of interrupts should be in Table Size encoding. It is very confusing that the API pci_epc_set_msix() and the callback function pci_epc_ops::set_msix() both take a parameter named 'interrupts', but they expect completely different encodings. Clean up the API and the callback function to have the same semantics, i.e. the parameter represents the number of interrupts, regardless of the internal encoding of that value. Also rename the parameter 'interrupts' to 'nr_irqs', in both the wrapper function and the callback function, such that the name is unambiguous. [bhelgaas: more specific subject] Signed-off-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Signed-off-by: Bjorn Helgaas Cc: stable+noautosel@kernel.org # this is simply a cleanup Link: https://patch.msgid.link/20250514074313.283156-14-cassel@kernel.org --- include/linux/pci-epc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 15d10c07c9f1..4286bfdbfdfa 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -103,7 +103,7 @@ struct pci_epc_ops { u8 nr_irqs); int (*get_msi)(struct pci_epc *epc, u8 func_no, u8 vfunc_no); int (*set_msix)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, - u16 interrupts, enum pci_barno, u32 offset); + u16 nr_irqs, enum pci_barno, u32 offset); int (*get_msix)(struct pci_epc *epc, u8 func_no, u8 vfunc_no); int (*raise_irq)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, unsigned int type, u16 interrupt_num); @@ -288,8 +288,8 @@ void pci_epc_unmap_addr(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr); int pci_epc_set_msi(struct pci_epc *epc, u8 func_no, u8 vfunc_no, u8 nr_irqs); int pci_epc_get_msi(struct pci_epc *epc, u8 func_no, u8 vfunc_no); -int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u8 vfunc_no, - u16 interrupts, enum pci_barno, u32 offset); +int pci_epc_set_msix(struct pci_epc *epc, u8 func_no, u8 vfunc_no, u16 nr_irqs, + enum pci_barno, u32 offset); int pci_epc_get_msix(struct pci_epc *epc, u8 func_no, u8 vfunc_no); int pci_epc_map_msi_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr, u8 interrupt_num, -- cgit v1.2.3 From c31f91c6af96a5eb0632f4aee8d4e39cad7d7559 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 16 Apr 2025 10:53:58 +0200 Subject: fuse: don't allow signals to interrupt getdents copying When getting the directory contents, the entries are first fetched to a kernel buffer, then they are copied to userspace with dir_emit(). This second phase is non-blocking as long as the userspace buffer is not paged out, making it interruptible makes zero sense. Overload d_type as flags, since it only uses 4 bits from 32. Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 016b0fe1536e..0f2a1a572e3a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2073,6 +2073,9 @@ struct dir_context { loff_t pos; }; +/* If OR-ed with d_type, pending signals are not checked */ +#define FILLDIR_FLAG_NOINTR 0x1000 + /* * These flags let !MMU mmap() govern direct device mapping vs immediate * copying more easily for MAP_PRIVATE, especially for ROM filesystems. -- cgit v1.2.3 From 467e245d47e6662a4cbf4184d9e6d0b1b120c0bf Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 16 Apr 2025 12:44:55 +0200 Subject: readdir: supply dir_context.count as readdir buffer size hint This is a preparation for large readdir buffers in fuse. Simply setting the fuse buffer size to the userspace buffer size should work, the record sizes are similar (fuse's is slightly larger than libc's, so no overflow should ever happen). Signed-off-by: Miklos Szeredi Signed-off-by: Jaco Kroon --- include/linux/fs.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0f2a1a572e3a..dfc5a3327124 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2071,6 +2071,13 @@ typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64, struct dir_context { filldir_t actor; loff_t pos; + /* + * Filesystems MUST NOT MODIFY count, but may use as a hint: + * 0 unknown + * > 0 space in buffer (assume at least one entry) + * INT_MAX unlimited + */ + int count; }; /* If OR-ed with d_type, pending signals are not checked */ -- cgit v1.2.3 From 5402c4d4d2000a9baa30c1157c97152ec6383733 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 25 May 2025 12:47:31 +0200 Subject: exportfs: require ->fh_to_parent() to encode connectable file handles When user requests a connectable file handle explicitly with the AT_HANDLE_CONNECTABLE flag, fail the request if filesystem (e.g. nfs) does not know how to decode a connected non-dir dentry. Fixes: c374196b2b9f ("fs: name_to_handle_at() support for "explicit connectable" file handles") Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/20250525104731.1461704-1-amir73il@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index fc93f0abf513..25c4a5afbd44 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -314,6 +314,9 @@ static inline bool exportfs_can_decode_fh(const struct export_operations *nop) static inline bool exportfs_can_encode_fh(const struct export_operations *nop, int fh_flags) { + if (!nop) + return false; + /* * If a non-decodeable file handle was requested, we only need to make * sure that filesystem did not opt-out of encoding fid. @@ -321,6 +324,13 @@ static inline bool exportfs_can_encode_fh(const struct export_operations *nop, if (fh_flags & EXPORT_FH_FID) return exportfs_can_encode_fid(nop); + /* + * If a connectable file handle was requested, we need to make sure that + * filesystem can also decode connected file handles. + */ + if ((fh_flags & EXPORT_FH_CONNECTABLE) && !nop->fh_to_parent) + return false; + /* * If a decodeable file handle was requested, we need to make sure that * filesystem can also decode file handles. -- cgit v1.2.3 From d1c696dba120624256ab335ab8247f535b872309 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 8 May 2025 12:40:32 +0530 Subject: PCI: host-common: Convert to library for host controller drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This common library will be used as a placeholder for helper functions shared by the host controller drivers. This avoids placing the host controller drivers specific helpers in drivers/pci/*.c, to avoid enlarging the kernel image on platforms that do not use host controller drivers at all (like x86/ACPI platforms). Suggested-by: Lukas Wunner Signed-off-by: Manivannan Sadhasivam Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20250508-pcie-reset-slot-v4-3-7050093e2b50@linaro.org --- include/linux/pci-ecam.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index 3a10f8cfc3ad..d930651473b4 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -93,10 +93,4 @@ extern const struct pci_ecam_ops al_pcie_ops; /* Amazon Annapurna Labs PCIe */ extern const struct pci_ecam_ops tegra194_pcie_ops; /* Tegra194 PCIe */ extern const struct pci_ecam_ops loongson_pci_ecam_ops; /* Loongson PCIe */ #endif - -#if IS_ENABLED(CONFIG_PCI_HOST_COMMON) -/* for DT-based PCI controllers that support ECAM */ -int pci_host_common_probe(struct platform_device *pdev); -void pci_host_common_remove(struct platform_device *pdev); -#endif #endif -- cgit v1.2.3 From acc53a0b4c156877773da6e9eea4113dc7e770ae Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 14 May 2025 19:15:07 +0100 Subject: mm: rename page->index to page->__folio_index All users of page->index have been converted to not refer to it any more. Update a few pieces of documentation that were missed and prevent new users from appearing (or at least make them easy to grep for). Link: https://lkml.kernel.org/r/20250514181508.3019795-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- include/linux/mm_types.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index cd2e513189d6..5009c53ff1fe 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1276,9 +1276,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf); * the page's disk buffers. PG_private must be set to tell the VM to call * into the filesystem to release these pages. * - * A page may belong to an inode's memory mapping. In this case, page->mapping - * is the pointer to the inode, and page->index is the file offset of the page, - * in units of PAGE_SIZE. + * A folio may belong to an inode's memory mapping. In this case, + * folio->mapping points to the inode, and folio->index is the file + * offset of the folio, in units of PAGE_SIZE. * * If pagecache pages are not associated with an inode, they are said to be * anonymous pages. These may become associated with the swapcache, and in that diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3e934dc6057c..17e0dcb87aae 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -107,7 +107,7 @@ struct page { /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; union { - pgoff_t index; /* Our offset within mapping. */ + pgoff_t __folio_index; /* Our offset within mapping. */ unsigned long share; /* share count for fsdax */ }; /** @@ -488,7 +488,7 @@ FOLIO_MATCH(flags, flags); FOLIO_MATCH(lru, lru); FOLIO_MATCH(mapping, mapping); FOLIO_MATCH(compound_head, lru); -FOLIO_MATCH(index, index); +FOLIO_MATCH(__folio_index, index); FOLIO_MATCH(private, private); FOLIO_MATCH(_mapcount, _mapcount); FOLIO_MATCH(_refcount, _refcount); @@ -589,7 +589,7 @@ TABLE_MATCH(flags, __page_flags); TABLE_MATCH(compound_head, pt_list); TABLE_MATCH(compound_head, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); -TABLE_MATCH(index, pt_index); +TABLE_MATCH(__folio_index, pt_index); TABLE_MATCH(rcu_head, pt_rcu_head); TABLE_MATCH(page_type, __page_type); TABLE_MATCH(_refcount, __page_refcount); -- cgit v1.2.3 From d9736929445e7f4c60f0093af61ff0b52e2d4412 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 14 May 2025 18:06:04 +0100 Subject: iov: remove copy_page_from_iter_atomic() All callers now use copy_folio_from_iter_atomic(), so convert copy_page_from_iter_atomic(). While I'm in there, use kmap_local_folio() and pagefault_disable() instead of kmap_atomic(). That allows preemption and/or task migration to happen during the copy_from_user(). Also use the new folio_test_partial_kmap() predicate instead of open-coding it. Link: https://lkml.kernel.org/r/20250514170607.3000994-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Alexander Viro Cc: Hugh Dickins Cc: Konstantin Komarov Signed-off-by: Andrew Morton --- include/linux/uio.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 49ece9e1888f..e46477482663 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -176,8 +176,6 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) return ret; } -size_t copy_page_from_iter_atomic(struct page *page, size_t offset, - size_t bytes, struct iov_iter *i); void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_revert(struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); @@ -187,6 +185,8 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); +size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, + size_t bytes, struct iov_iter *i); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); @@ -204,12 +204,6 @@ static inline size_t copy_folio_from_iter(struct folio *folio, size_t offset, return copy_page_from_iter(&folio->page, offset, bytes, i); } -static inline size_t copy_folio_from_iter_atomic(struct folio *folio, - size_t offset, size_t bytes, struct iov_iter *i) -{ - return copy_page_from_iter_atomic(&folio->page, offset, bytes, i); -} - size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i); -- cgit v1.2.3 From 940b01fc8dc1aead398819215650727cb9e7335e Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sun, 18 May 2025 23:31:39 -0700 Subject: memcg: nmi safe memcg stats for specific archs There are archs which have NMI but does not support this_cpu_* ops safely in the nmi context but they support safe atomic ops in nmi context. For such archs, let's add infra to use atomic ops for the memcg stats which can be updated in nmi. At the moment, the memcg stats which get updated in the objcg charging path are MEMCG_KMEM, NR_SLAB_RECLAIMABLE_B & NR_SLAB_UNRECLAIMABLE_B. Rather than adding support for all memcg stats to be nmi safe, let's just add infra to make these three stats nmi safe which this patch is doing. Link: https://lkml.kernel.org/r/20250519063142.111219-3-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Muchun Song Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f7848f73f41c..87b6688f124a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -113,6 +113,12 @@ struct mem_cgroup_per_node { CACHELINE_PADDING(_pad2_); unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter; + +#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC + /* slab stats for nmi context */ + atomic_t slab_reclaimable; + atomic_t slab_unreclaimable; +#endif }; struct mem_cgroup_threshold { @@ -236,6 +242,10 @@ struct mem_cgroup { atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS]; +#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC + /* MEMCG_KMEM for nmi context */ + atomic_t kmem_stat; +#endif /* * Hint of reclaim pressure for socket memroy management. Note * that this indicator should NOT be used in legacy cgroup mode -- cgit v1.2.3 From 49c69504f4d340d870f2c3f3d2f404c118ff7b23 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 23 May 2025 00:30:17 +0200 Subject: mmu_notifiers: remove leftover stub macros Commit ec8832d007cb ("mmu_notifiers: don't invalidate secondary TLBs as part of mmu_notifier_invalidate_range_end()") removed the main definitions of {ptep,pmdp_huge,pudp_huge}_clear_flush_notify; just their !CONFIG_MMU_NOTIFIER stubs are left behind, remove them. Link: https://lkml.kernel.org/r/20250523-mmu-notifier-cleanup-unused-v1-1-cc1f47ebec33@google.com Signed-off-by: Jann Horn Reviewed-by: Alistair Popple Reviewed-by: Qi Zheng Reviewed-by: Jason Gunthorpe Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index bc2402a45741..d1094c2d5fb6 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -654,9 +654,6 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young #define pmdp_clear_young_notify pmdp_test_and_clear_young -#define ptep_clear_flush_notify ptep_clear_flush -#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush -#define pudp_huge_clear_flush_notify pudp_huge_clear_flush static inline void mmu_notifier_synchronize(void) { -- cgit v1.2.3 From e13e7922d03439e374c263049af5f740ceae6346 Mon Sep 17 00:00:00 2001 From: Juan Yescas Date: Wed, 21 May 2025 14:57:45 -0700 Subject: mm: add CONFIG_PAGE_BLOCK_ORDER to select page block order Problem: On large page size configurations (16KiB, 64KiB), the CMA alignment requirement (CMA_MIN_ALIGNMENT_BYTES) increases considerably, and this causes the CMA reservations to be larger than necessary. This means that system will have less available MIGRATE_UNMOVABLE and MIGRATE_RECLAIMABLE page blocks since MIGRATE_CMA can't fallback to them. The CMA_MIN_ALIGNMENT_BYTES increases because it depends on MAX_PAGE_ORDER which depends on ARCH_FORCE_MAX_ORDER. The value of ARCH_FORCE_MAX_ORDER increases on 16k and 64k kernels. For example, in ARM, the CMA alignment requirement when: - CONFIG_ARCH_FORCE_MAX_ORDER default value is used - CONFIG_TRANSPARENT_HUGEPAGE is set: PAGE_SIZE | MAX_PAGE_ORDER | pageblock_order | CMA_MIN_ALIGNMENT_BYTES ----------------------------------------------------------------------- 4KiB | 10 | 9 | 4KiB * (2 ^ 9) = 2MiB 16Kib | 11 | 11 | 16KiB * (2 ^ 11) = 32MiB 64KiB | 13 | 13 | 64KiB * (2 ^ 13) = 512MiB There are some extreme cases for the CMA alignment requirement when: - CONFIG_ARCH_FORCE_MAX_ORDER maximum value is set - CONFIG_TRANSPARENT_HUGEPAGE is NOT set: - CONFIG_HUGETLB_PAGE is NOT set PAGE_SIZE | MAX_PAGE_ORDER | pageblock_order | CMA_MIN_ALIGNMENT_BYTES ------------------------------------------------------------------------ 4KiB | 15 | 15 | 4KiB * (2 ^ 15) = 128MiB 16Kib | 13 | 13 | 16KiB * (2 ^ 13) = 128MiB 64KiB | 13 | 13 | 64KiB * (2 ^ 13) = 512MiB This affects the CMA reservations for the drivers. If a driver in a 4KiB kernel needs 4MiB of CMA memory, in a 16KiB kernel, the minimal reservation has to be 32MiB due to the alignment requirements: reserved-memory { ... cma_test_reserve: cma_test_reserve { compatible = "shared-dma-pool"; size = <0x0 0x400000>; /* 4 MiB */ ... }; }; reserved-memory { ... cma_test_reserve: cma_test_reserve { compatible = "shared-dma-pool"; size = <0x0 0x2000000>; /* 32 MiB */ ... }; }; Solution: Add a new config CONFIG_PAGE_BLOCK_ORDER that allows to set the page block order in all the architectures. The maximum page block order will be given by ARCH_FORCE_MAX_ORDER. By default, CONFIG_PAGE_BLOCK_ORDER will have the same value that ARCH_FORCE_MAX_ORDER. This will make sure that current kernel configurations won't be affected by this change. It is a opt-in change. This patch will allow to have the same CMA alignment requirements for large page sizes (16KiB, 64KiB) as that in 4kb kernels by setting a lower pageblock_order. Tests: - Verified that HugeTLB pages work when pageblock_order is 1, 7, 10 on 4k and 16k kernels. - Verified that Transparent Huge Pages work when pageblock_order is 1, 7, 10 on 4k and 16k kernels. - Verified that dma-buf heaps allocations work when pageblock_order is 1, 7, 10 on 4k and 16k kernels. Benchmarks: The benchmarks compare 16kb kernels with pageblock_order 10 and 7. The reason for the pageblock_order 7 is because this value makes the min CMA alignment requirement the same as that in 4kb kernels (2MB). - Perform 100K dma-buf heaps (/dev/dma_heap/system) allocations of SZ_8M, SZ_4M, SZ_2M, SZ_1M, SZ_64, SZ_8, SZ_4. Use simpleperf (https://developer.android.com/ndk/guides/simpleperf) to measure the # of instructions and page-faults on 16k kernels. The benchmark was executed 10 times. The averages are below: # instructions | #page-faults order 10 | order 7 | order 10 | order 7 -------------------------------------------------------- 13,891,765,770 | 11,425,777,314 | 220 | 217 14,456,293,487 | 12,660,819,302 | 224 | 219 13,924,261,018 | 13,243,970,736 | 217 | 221 13,910,886,504 | 13,845,519,630 | 217 | 221 14,388,071,190 | 13,498,583,098 | 223 | 224 13,656,442,167 | 12,915,831,681 | 216 | 218 13,300,268,343 | 12,930,484,776 | 222 | 218 13,625,470,223 | 14,234,092,777 | 219 | 218 13,508,964,965 | 13,432,689,094 | 225 | 219 13,368,950,667 | 13,683,587,37 | 219 | 225 ------------------------------------------------------------------- 13,803,137,433 | 13,131,974,268 | 220 | 220 Averages There were 4.85% #instructions when order was 7, in comparison with order 10. 13,803,137,433 - 13,131,974,268 = -671,163,166 (-4.86%) The number of page faults in order 7 and 10 were the same. These results didn't show any significant regression when the pageblock_order is set to 7 on 16kb kernels. - Run speedometer 3.1 (https://browserbench.org/Speedometer3.1/) 5 times on the 16k kernels with pageblock_order 7 and 10. order 10 | order 7 | order 7 - order 10 | (order 7 - order 10) % ------------------------------------------------------------------- 15.8 | 16.4 | 0.6 | 3.80% 16.4 | 16.2 | -0.2 | -1.22% 16.6 | 16.3 | -0.3 | -1.81% 16.8 | 16.3 | -0.5 | -2.98% 16.6 | 16.8 | 0.2 | 1.20% ------------------------------------------------------------------- 16.44 16.4 -0.04 -0.24% Averages The results didn't show any significant regression when the pageblock_order is set to 7 on 16kb kernels. Link: https://lkml.kernel.org/r/20250521215807.1860663-1-jyescas@google.com Signed-off-by: Juan Yescas Acked-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: David Hildenbrand Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Minchan Kim Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 16 ++++++++++++++++ include/linux/pageblock-flags.h | 8 ++++---- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b19a98c20de8..87a667533d6d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -37,6 +37,22 @@ #define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1) +/* Defines the order for the number of pages that have a migrate type. */ +#ifndef CONFIG_PAGE_BLOCK_ORDER +#define PAGE_BLOCK_ORDER MAX_PAGE_ORDER +#else +#define PAGE_BLOCK_ORDER CONFIG_PAGE_BLOCK_ORDER +#endif /* CONFIG_PAGE_BLOCK_ORDER */ + +/* + * The MAX_PAGE_ORDER, which defines the max order of pages to be allocated + * by the buddy allocator, has to be larger or equal to the PAGE_BLOCK_ORDER, + * which defines the order for the number of pages that can have a migrate type + */ +#if (PAGE_BLOCK_ORDER > MAX_PAGE_ORDER) +#error MAX_PAGE_ORDER must be >= PAGE_BLOCK_ORDER +#endif + /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index fc6b9c87cb0a..e73a4292ef02 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -41,18 +41,18 @@ extern unsigned int pageblock_order; * Huge pages are a constant size, but don't exceed the maximum allocation * granularity. */ -#define pageblock_order MIN_T(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER) +#define pageblock_order MIN_T(unsigned int, HUGETLB_PAGE_ORDER, PAGE_BLOCK_ORDER) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ #elif defined(CONFIG_TRANSPARENT_HUGEPAGE) -#define pageblock_order MIN_T(unsigned int, HPAGE_PMD_ORDER, MAX_PAGE_ORDER) +#define pageblock_order MIN_T(unsigned int, HPAGE_PMD_ORDER, PAGE_BLOCK_ORDER) #else /* CONFIG_TRANSPARENT_HUGEPAGE */ -/* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ -#define pageblock_order MAX_PAGE_ORDER +/* If huge pages are not used, group by PAGE_BLOCK_ORDER */ +#define pageblock_order PAGE_BLOCK_ORDER #endif /* CONFIG_HUGETLB_PAGE */ -- cgit v1.2.3 From ad6b26b6a0a79166b53209df2ca1cf8636296382 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 23 May 2025 20:51:15 +0800 Subject: sched/numa: add statistics of numa balance task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On systems with NUMA balancing enabled, it has been found that tracking task activities resulting from NUMA balancing is beneficial. NUMA balancing employs two mechanisms for task migration: one is to migrate a task to an idle CPU within its preferred node, and the other is to swap tasks located on different nodes when they are on each other's preferred nodes. The kernel already provides NUMA page migration statistics in /sys/fs/cgroup/mytest/memory.stat and /proc/{PID}/sched. However, it lacks statistics regarding task migration and swapping. Therefore, relevant counts for task migration and swapping should be added. The following two new fields: numa_task_migrated numa_task_swapped will be shown in /sys/fs/cgroup/{GROUP}/memory.stat, /proc/{PID}/sched and /proc/vmstat. Introducing both per-task and per-memory cgroup (memcg) NUMA balancing statistics facilitates a rapid evaluation of the performance and resource utilization of the target workload. For instance, users can first identify the container with high NUMA balancing activity and then further pinpoint a specific task within that group, and subsequently adjust the memory policy for that task. In short, although it is possible to iterate through /proc/$pid/sched to locate the problematic task, the introduction of aggregated NUMA balancing activity for tasks within each memcg can assist users in identifying the task more efficiently through a divide-and-conquer approach. As Libo Chen pointed out, the memcg event relies on the text names in vmstat_text, and /proc/vmstat generates corresponding items based on vmstat_text. Thus, the relevant task migration and swapping events introduced in vmstat_text also need to be populated by count_vm_numa_event(), otherwise these values are zero in /proc/vmstat. In theory, task migration and swap events are part of the scheduler's activities. The reason for exposing them through the memory.stat/vmstat interface is that we already have NUMA balancing statistics in memory.stat/vmstat, and these events are closely related to each other. Following Shakeel's suggestion, we describe the end-to-end flow/story of all these events occurring on a timeline for future reference: The goal of NUMA balancing is to co-locate a task and its memory pages on the same NUMA node. There are two strategies: migrate the pages to the task's node, or migrate the task to the node where its pages reside. Suppose a task p1 is running on Node 0, but its pages are located on Node 1. NUMA page fault statistics for p1 reveal its "page footprint" across nodes. If NUMA balancing detects that most of p1's pages are on Node 1: 1.Page Migration Attempt: The Numa balance first tries to migrate p1's pages to Node 0. The numa_page_migrate counter increments. 2.Task Migration Strategies: After the page migration finishes, Numa balance checks every 1 second to see if p1 can be migrated to Node 1. Case 2.1: Idle CPU Available If Node 1 has an idle CPU, p1 is directly scheduled there. This event is logged as numa_task_migrated. Case 2.2: No Idle CPU (Task Swap) If all CPUs on Node1 are busy, direct migration could cause CPU contention or load imbalance. Instead: The Numa balance selects a candidate task p2 on Node 1 that prefers Node 0 (e.g., due to its own page footprint). p1 and p2 are swapped. This cross-node swap is recorded as numa_task_swapped. Link: https://lkml.kernel.org/r/d00edb12ba0f0de3c5222f61487e65f2ac58f5b1.1748493462.git.yu.c.chen@intel.com Link: https://lkml.kernel.org/r/7ef90a88602ed536be46eba7152ed0d33bad5790.1748002400.git.yu.c.chen@intel.com Signed-off-by: Chen Yu Tested-by: K Prateek Nayak Tested-by: Madadi Vineeth Reddy Acked-by: Peter Zijlstra (Intel) Tested-by: Venkat Rao Bagalkote Cc: Aubrey Li Cc: Ayush Jain Cc: "Chen, Tim C" Cc: Ingo Molnar Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Libo Chen Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/sched.h | 4 ++++ include/linux/vm_event_item.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f96ac1982893..1c50e30b5c01 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -549,6 +549,10 @@ struct sched_statistics { u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; +#ifdef CONFIG_NUMA_BALANCING + u64 numa_task_migrated; + u64 numa_task_swapped; +#endif u64 nr_wakeups; u64 nr_wakeups_sync; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 9e15a088ba38..91a3ce9a2687 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -66,6 +66,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, NUMA_HINT_FAULTS, NUMA_HINT_FAULTS_LOCAL, NUMA_PAGE_MIGRATE, + NUMA_TASK_MIGRATE, + NUMA_TASK_SWAP, #endif #ifdef CONFIG_MIGRATION PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, -- cgit v1.2.3 From 5c78e793f78732b60276401f75cc1a101f9ad121 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 30 May 2025 12:06:47 -0700 Subject: overflow: Introduce __DEFINE_FLEX for having no initializer While not yet in the tree, there is a proposed patch[1] that was depending on the prior behavior of _DEFINE_FLEX, which did not have an explicit initializer. Provide this via __DEFINE_FLEX now, which can also have attributes applied (e.g. __uninitialized). Examples of the resulting initializer behaviors can be seen here: https://godbolt.org/z/P7Go8Tr33 Link: https://lore.kernel.org/netdev/20250520205920.2134829-9-anthony.l.nguyen@intel.com [1] Fixes: 47e36ed78406 ("overflow: Fix direct struct member initialization in _DEFINE_FLEX()") Signed-off-by: Kees Cook --- include/linux/overflow.h | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 7b7be27ca113..154ed0dbb43f 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -389,24 +389,37 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) struct_size((type *)NULL, member, count) /** - * _DEFINE_FLEX() - helper macro for DEFINE_FLEX() family. - * Enables caller macro to pass (different) initializer. + * __DEFINE_FLEX() - helper macro for DEFINE_FLEX() family. + * Enables caller macro to pass arbitrary trailing expressions * * @type: structure type name, including "struct" keyword. * @name: Name for a variable to define. * @member: Name of the array member. * @count: Number of elements in the array; must be compile-time const. - * @initializer: Initializer expression (e.g., pass `= { }` at minimum). + * @trailer: Trailing expressions for attributes and/or initializers. */ -#define _DEFINE_FLEX(type, name, member, count, initializer...) \ +#define __DEFINE_FLEX(type, name, member, count, trailer...) \ _Static_assert(__builtin_constant_p(count), \ "onstack flex array members require compile-time const count"); \ union { \ u8 bytes[struct_size_t(type, member, count)]; \ type obj; \ - } name##_u = { .obj initializer }; \ + } name##_u trailer; \ type *name = (type *)&name##_u +/** + * _DEFINE_FLEX() - helper macro for DEFINE_FLEX() family. + * Enables caller macro to pass (different) initializer. + * + * @type: structure type name, including "struct" keyword. + * @name: Name for a variable to define. + * @member: Name of the array member. + * @count: Number of elements in the array; must be compile-time const. + * @initializer: Initializer expression (e.g., pass `= { }` at minimum). + */ +#define _DEFINE_FLEX(type, name, member, count, initializer...) \ + __DEFINE_FLEX(type, name, member, count, = { .obj initializer }) + /** * DEFINE_RAW_FLEX() - Define an on-stack instance of structure with a trailing * flexible array member, when it does not have a __counted_by annotation. @@ -424,7 +437,7 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) * elements in array @member. */ #define DEFINE_RAW_FLEX(type, name, member, count) \ - _DEFINE_FLEX(type, name, member, count, = {}) + __DEFINE_FLEX(type, name, member, count, = { }) /** * DEFINE_FLEX() - Define an on-stack instance of structure with a trailing -- cgit v1.2.3 From 1c8a0ed2043c30cee97facd1eb8cff88b6c7ea4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 7 Apr 2025 13:12:14 +0300 Subject: PCI: Remove unused pci_printk() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit include/linux/pci.h provides low-level pci_printk() interface that is not used since the commits fab874e12593 ("PCI/AER: Descope pci_printk() to aer_printk()") and 588021b28642 ("PCI: shpchp: Remove 'shpchp_debug' module parameter"). PCI logging should not use pci_printk() but pci_*() wrappers that follow the usual logging wrapper patterns. Remove pci_printk(). Signed-off-by: Ilpo Järvinen Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20250407101215.1376-1-ilpo.jarvinen@linux.intel.com --- include/linux/pci.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 0e8e3fd77e96..e293ad5d840d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2694,9 +2694,6 @@ void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type); #include -#define pci_printk(level, pdev, fmt, arg...) \ - dev_printk(level, &(pdev)->dev, fmt, ##arg) - #define pci_emerg(pdev, fmt, arg...) dev_emerg(&(pdev)->dev, fmt, ##arg) #define pci_alert(pdev, fmt, arg...) dev_alert(&(pdev)->dev, fmt, ##arg) #define pci_crit(pdev, fmt, arg...) dev_crit(&(pdev)->dev, fmt, ##arg) -- cgit v1.2.3 From b8628379957d0c8c057782ec1a8512de6d4ba29c Mon Sep 17 00:00:00 2001 From: Sumanth Gavini Date: Mon, 19 May 2025 19:08:05 -0700 Subject: mfd: maxim: Correct Samsung "Electronics" spelling in headers Fix the misspelling of 'Electronics' in MFD driver headers. Link: https://lore.kernel.org/lkml/3aa30119-60e5-4dcb-b13a-1753966ca775@sirena.org.uk/#t Link: https://lore.kernel.org/r/20250520020808.159586-1-sumanth.gavini@yahoo.com Signed-off-by: Sumanth Gavini Signed-off-by: Lee Jones --- include/linux/mfd/max8997-private.h | 2 +- include/linux/mfd/max8998-private.h | 2 +- include/linux/mfd/max8998.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/max8997-private.h b/include/linux/mfd/max8997-private.h index f70eea0f2264..261c0aae7d00 100644 --- a/include/linux/mfd/max8997-private.h +++ b/include/linux/mfd/max8997-private.h @@ -2,7 +2,7 @@ /* * max8997-private.h - Voltage regulator driver for the Maxim 8997 * - * Copyright (C) 2010 Samsung Electrnoics + * Copyright (C) 2010 Samsung Electronics * MyungJoo Ham */ diff --git a/include/linux/mfd/max8998-private.h b/include/linux/mfd/max8998-private.h index 6deb5f577602..d77dc18db6eb 100644 --- a/include/linux/mfd/max8998-private.h +++ b/include/linux/mfd/max8998-private.h @@ -2,7 +2,7 @@ /* * max8998-private.h - Voltage regulator driver for the Maxim 8998 * - * Copyright (C) 2009-2010 Samsung Electrnoics + * Copyright (C) 2009-2010 Samsung Electronics * Kyungmin Park * Marek Szyprowski */ diff --git a/include/linux/mfd/max8998.h b/include/linux/mfd/max8998.h index a054e55c8646..5473f1983e31 100644 --- a/include/linux/mfd/max8998.h +++ b/include/linux/mfd/max8998.h @@ -2,7 +2,7 @@ /* * max8998.h - Voltage regulator driver for the Maxim 8998 * - * Copyright (C) 2009-2010 Samsung Electrnoics + * Copyright (C) 2009-2010 Samsung Electronics * Kyungmin Park * Marek Szyprowski */ -- cgit v1.2.3 From ffb006aa433e8109ec79320c344fb69947997ba1 Mon Sep 17 00:00:00 2001 From: Sumanth Gavini Date: Mon, 19 May 2025 16:20:23 -0700 Subject: mfd: maxim: Correct Samsung "Electronics" spelling in copyright headers Fix the misspelling of 'Electronics' in MFD driver copyright headers. Link: https://lore.kernel.org/lkml/3aa30119-60e5-4dcb-b13a-1753966ca775@sirena.org.uk/#t Link: https://lore.kernel.org/r/20250519232025.152769-1-sumanth.gavini@yahoo.com Signed-off-by: Sumanth Gavini Signed-off-by: Lee Jones --- include/linux/mfd/max14577-private.h | 2 +- include/linux/mfd/max14577.h | 2 +- include/linux/mfd/max77686-private.h | 2 +- include/linux/mfd/max77686.h | 2 +- include/linux/mfd/max77693-private.h | 2 +- include/linux/mfd/max77693.h | 2 +- include/linux/mfd/max8997.h | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/max14577-private.h b/include/linux/mfd/max14577-private.h index a21374f8ad26..dd51a37fa37f 100644 --- a/include/linux/mfd/max14577-private.h +++ b/include/linux/mfd/max14577-private.h @@ -2,7 +2,7 @@ /* * max14577-private.h - Common API for the Maxim 14577/77836 internal sub chip * - * Copyright (C) 2014 Samsung Electrnoics + * Copyright (C) 2014 Samsung Electronics * Chanwoo Choi * Krzysztof Kozlowski */ diff --git a/include/linux/mfd/max14577.h b/include/linux/mfd/max14577.h index 8b3ef891ba42..0fda5c2e745a 100644 --- a/include/linux/mfd/max14577.h +++ b/include/linux/mfd/max14577.h @@ -2,7 +2,7 @@ /* * max14577.h - Driver for the Maxim 14577/77836 * - * Copyright (C) 2014 Samsung Electrnoics + * Copyright (C) 2014 Samsung Electronics * Chanwoo Choi * Krzysztof Kozlowski * diff --git a/include/linux/mfd/max77686-private.h b/include/linux/mfd/max77686-private.h index ea635d12a741..e6b8b4014dc0 100644 --- a/include/linux/mfd/max77686-private.h +++ b/include/linux/mfd/max77686-private.h @@ -2,7 +2,7 @@ /* * max77686-private.h - Voltage regulator driver for the Maxim 77686/802 * - * Copyright (C) 2012 Samsung Electrnoics + * Copyright (C) 2012 Samsung Electronics * Chiwoong Byun */ diff --git a/include/linux/mfd/max77686.h b/include/linux/mfd/max77686.h index d0fb510875e6..7c4624acd1db 100644 --- a/include/linux/mfd/max77686.h +++ b/include/linux/mfd/max77686.h @@ -2,7 +2,7 @@ /* * max77686.h - Driver for the Maxim 77686/802 * - * Copyright (C) 2012 Samsung Electrnoics + * Copyright (C) 2012 Samsung Electronics * Chiwoong Byun * * This driver is based on max8997.h diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h index c324d548619e..8e7c35b5ea1c 100644 --- a/include/linux/mfd/max77693-private.h +++ b/include/linux/mfd/max77693-private.h @@ -2,7 +2,7 @@ /* * max77693-private.h - Voltage regulator driver for the Maxim 77693 * - * Copyright (C) 2012 Samsung Electrnoics + * Copyright (C) 2012 Samsung Electronics * SangYoung Son * * This program is not provided / owned by Maxim Integrated Products. diff --git a/include/linux/mfd/max77693.h b/include/linux/mfd/max77693.h index c67c16ba8649..8e77ebeb7cf1 100644 --- a/include/linux/mfd/max77693.h +++ b/include/linux/mfd/max77693.h @@ -2,7 +2,7 @@ /* * max77693.h - Driver for the Maxim 77693 * - * Copyright (C) 2012 Samsung Electrnoics + * Copyright (C) 2012 Samsung Electronics * SangYoung Son * * This program is not provided / owned by Maxim Integrated Products. diff --git a/include/linux/mfd/max8997.h b/include/linux/mfd/max8997.h index 5c2cc1103437..fb36e1386069 100644 --- a/include/linux/mfd/max8997.h +++ b/include/linux/mfd/max8997.h @@ -2,7 +2,7 @@ /* * max8997.h - Driver for the Maxim 8997/8966 * - * Copyright (C) 2009-2010 Samsung Electrnoics + * Copyright (C) 2009-2010 Samsung Electronics * MyungJoo Ham * * This driver is based on max8998.h -- cgit v1.2.3 From 10f4a7cd724e34b7a6ff96e57ac49dc0cadececc Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 20 May 2025 13:20:37 -0700 Subject: nvme: fix command limits status code The command specific status code, 0x183, was introduced in the NVMe 2.0 specification defined to "Command Size Limits Exceeded" and only ever applied to DSM and Copy commands. Fix the name and, remove the incorrect translation to error codes and special treatment in the target code for it. Fixes: 3b7c33b28a44d4 ("nvme.h: add Write Zeroes definitions") Cc: Chaitanya Kulkarni Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 51308f65b72f..b65a1b9f2116 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -2171,7 +2171,7 @@ enum { NVME_SC_BAD_ATTRIBUTES = 0x180, NVME_SC_INVALID_PI = 0x181, NVME_SC_READ_ONLY = 0x182, - NVME_SC_ONCS_NOT_SUPPORTED = 0x183, + NVME_SC_CMD_SIZE_LIM_EXCEEDED = 0x183, /* * I/O Command Set Specific - Fabrics commands: -- cgit v1.2.3 From 434d7f9b0e24e1f0166d05f10881a8ab386845b7 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 30 May 2025 16:30:11 +0800 Subject: timens: Add struct seq_file forward declaration Add forward declaration of struct seq_file before using it in a function prototype. Fixes: 04a8682a71be ("fs/proc: Introduce /proc/pid/timens_offsets") Signed-off-by: Herbert Xu Signed-off-by: Thomas Gleixner Acked-by: Andrei Vagin Link: https://lore.kernel.org/all/aDlskzKIAULMlwPj@gondor.apana.org.au --- include/linux/time_namespace.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 0b8b32bf0655..bb2c52f4fc94 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -12,6 +12,7 @@ struct user_namespace; extern struct user_namespace init_user_ns; +struct seq_file; struct vm_area_struct; struct timens_offsets { -- cgit v1.2.3 From 1e1f706fc2ce90eaaf3480b3d5f27885960d751c Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Tue, 3 Jun 2025 15:35:38 +1000 Subject: wifi: cfg80211/mac80211: correctly parse S1G beacon optional elements S1G beacons are not traditional beacons but a type of extension frame. Extension frames contain the frame control and duration fields, followed by zero or more optional fields before the frame body. These optional fields are distinct from the variable length elements. The presence of optional fields is indicated in the frame control field. To correctly locate the elements offset, the frame control must be parsed to identify which optional fields are present. Currently, mac80211 parses S1G beacons based on fixed assumptions about the frame layout, without inspecting the frame control field. This can result in incorrect offsets to the "variable" portion of the frame. Properly parse S1G beacon frames by using the field lengths defined in IEEE 802.11-2024, section 9.3.4.3, ensuring that the elements offset is calculated accurately. Fixes: 9eaffe5078ca ("cfg80211: convert S1G beacon to scan results") Fixes: cd418ba63f0c ("mac80211: convert S1G beacon to scan results") Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250603053538.468562-1-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 79 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 420c7f9aa6ee..ce377f7fb912 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -111,6 +111,8 @@ /* bits unique to S1G beacon */ #define IEEE80211_S1G_BCN_NEXT_TBTT 0x100 +#define IEEE80211_S1G_BCN_CSSID 0x200 +#define IEEE80211_S1G_BCN_ANO 0x400 /* see 802.11ah-2016 9.9 NDP CMAC frames */ #define IEEE80211_S1G_1MHZ_NDP_BITS 25 @@ -153,9 +155,6 @@ #define IEEE80211_ANO_NETTYPE_WILD 15 -/* bits unique to S1G beacon */ -#define IEEE80211_S1G_BCN_NEXT_TBTT 0x100 - /* control extension - for IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTL_EXT */ #define IEEE80211_CTL_EXT_POLL 0x2000 #define IEEE80211_CTL_EXT_SPR 0x3000 @@ -627,6 +626,42 @@ static inline bool ieee80211_is_s1g_beacon(__le16 fc) cpu_to_le16(IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON); } +/** + * ieee80211_s1g_has_next_tbtt - check if IEEE80211_S1G_BCN_NEXT_TBTT + * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame contains the variable-length + * next TBTT field + */ +static inline bool ieee80211_s1g_has_next_tbtt(__le16 fc) +{ + return ieee80211_is_s1g_beacon(fc) && + (fc & cpu_to_le16(IEEE80211_S1G_BCN_NEXT_TBTT)); +} + +/** + * ieee80211_s1g_has_ano - check if IEEE80211_S1G_BCN_ANO + * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame contains the variable-length + * ANO field + */ +static inline bool ieee80211_s1g_has_ano(__le16 fc) +{ + return ieee80211_is_s1g_beacon(fc) && + (fc & cpu_to_le16(IEEE80211_S1G_BCN_ANO)); +} + +/** + * ieee80211_s1g_has_cssid - check if IEEE80211_S1G_BCN_CSSID + * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame contains the variable-length + * compressed SSID field + */ +static inline bool ieee80211_s1g_has_cssid(__le16 fc) +{ + return ieee80211_is_s1g_beacon(fc) && + (fc & cpu_to_le16(IEEE80211_S1G_BCN_CSSID)); +} + /** * ieee80211_is_s1g_short_beacon - check if frame is an S1G short beacon * @fc: frame control bytes in little-endian byteorder @@ -1245,16 +1280,40 @@ struct ieee80211_ext { u8 change_seq; u8 variable[0]; } __packed s1g_beacon; - struct { - u8 sa[ETH_ALEN]; - __le32 timestamp; - u8 change_seq; - u8 next_tbtt[3]; - u8 variable[0]; - } __packed s1g_short_beacon; } u; } __packed __aligned(2); +/** + * ieee80211_s1g_optional_len - determine length of optional S1G beacon fields + * @fc: frame control bytes in little-endian byteorder + * Return: total length in bytes of the optional fixed-length fields + * + * S1G beacons may contain up to three optional fixed-length fields that + * precede the variable-length elements. Whether these fields are present + * is indicated by flags in the frame control field. + * + * From IEEE 802.11-2024 section 9.3.4.3: + * - Next TBTT field may be 0 or 3 bytes + * - Short SSID field may be 0 or 4 bytes + * - Access Network Options (ANO) field may be 0 or 1 byte + */ +static inline size_t +ieee80211_s1g_optional_len(__le16 fc) +{ + size_t len = 0; + + if (ieee80211_s1g_has_next_tbtt(fc)) + len += 3; + + if (ieee80211_s1g_has_cssid(fc)) + len += 4; + + if (ieee80211_s1g_has_ano(fc)) + len += 1; + + return len; +} + #define IEEE80211_TWT_CONTROL_NDP BIT(0) #define IEEE80211_TWT_CONTROL_RESP_MODE BIT(1) #define IEEE80211_TWT_CONTROL_NEG_TYPE_BROADCAST BIT(3) -- cgit v1.2.3 From 500e626c4a5bf2fa7cc6f1cd6dd86c77b5b127f2 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Tue, 8 Apr 2025 02:08:28 +0800 Subject: kernel: ftrace: export ftrace_sync_ipi The following ftrace patch for riscv uses a data store to update ftrace function. Therefore, a romote fence is required to order it against function_trace_op updates. The mechanism is similar to the fence between function_trace_op and update_ftrace_func in the generic ftrace, so we leverage the same ftrace_sync_ipi function. [ alex: Fix build warning when !CONFIG_DYNAMIC_FTRACE ] Signed-off-by: Andy Chiu Link: https://lore.kernel.org/r/20250407180838.42877-4-andybnac@gmail.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- include/linux/ftrace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fbabc3d848b3..30374478cb07 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -635,6 +635,8 @@ enum { #define ftrace_get_symaddr(fentry_ip) (0) #endif +void ftrace_sync_ipi(void *data); + #ifdef CONFIG_DYNAMIC_FTRACE void ftrace_arch_code_modify_prepare(void); -- cgit v1.2.3 From 8c21c4111128365f81a88573eeb2844fa696b299 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 2 Jun 2025 19:55:36 +0900 Subject: module: make __mod_device_table__* symbols static The __mod_device_table__* symbols are only parsed by modpost to generate MODULE_ALIAS() entries from MODULE_DEVICE_TABLE(). Therefore, these symbols do not need to be globally visible, or globally unique. If they are in the global scope, we would worry about the symbol uniqueness, but modpost is fine with parsing multiple symbols with the same name. Signed-off-by: Masahiro Yamada Reviewed-by: Petr Pavlu --- include/linux/module.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 8050f77c3b64..92e1420fccdf 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -249,8 +249,8 @@ struct module_kobject *lookup_or_create_module_kobject(const char *name); #ifdef MODULE /* Creates an alias so file2alias.c can find device table. */ #define MODULE_DEVICE_TABLE(type, name) \ -extern typeof(name) __mod_device_table__##type##__##name \ - __attribute__ ((unused, alias(__stringify(name)))) +static typeof(name) __mod_device_table__##type##__##name \ + __attribute__ ((used, alias(__stringify(name)))) #else /* !MODULE */ #define MODULE_DEVICE_TABLE(type, name) #endif -- cgit v1.2.3 From 6093faaf9593fca92f96f165c95ff4b53353b1f4 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Wed, 5 Mar 2025 16:37:06 +0800 Subject: raid6: Add RISC-V SIMD syndrome and recovery calculations The assembly is originally based on the ARM NEON and int.uc, but uses RISC-V vector instructions to implement the RAID6 syndrome and recovery calculations. The functions are tested on QEMU running with the option "-icount shift=0": raid6: rvvx1 gen() 1008 MB/s raid6: rvvx2 gen() 1395 MB/s raid6: rvvx4 gen() 1584 MB/s raid6: rvvx8 gen() 1694 MB/s raid6: int64x8 gen() 113 MB/s raid6: int64x4 gen() 116 MB/s raid6: int64x2 gen() 272 MB/s raid6: int64x1 gen() 229 MB/s raid6: using algorithm rvvx8 gen() 1694 MB/s raid6: .... xor() 1000 MB/s, rmw enabled raid6: using rvv recovery algorithm [Charlie: - Fixup vector options] Signed-off-by: Charlie Jenkins Signed-off-by: Chunyan Zhang Reviewed-by: Charlie Jenkins Tested-by: Charlie Jenkins Link: https://lore.kernel.org/r/20250305083707.74218-1-zhangchunyan@iscas.ac.cn Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- include/linux/raid/pq.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 98030accf641..72ff44cca864 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -108,6 +108,10 @@ extern const struct raid6_calls raid6_vpermxor4; extern const struct raid6_calls raid6_vpermxor8; extern const struct raid6_calls raid6_lsx; extern const struct raid6_calls raid6_lasx; +extern const struct raid6_calls raid6_rvvx1; +extern const struct raid6_calls raid6_rvvx2; +extern const struct raid6_calls raid6_rvvx4; +extern const struct raid6_calls raid6_rvvx8; struct raid6_recov_calls { void (*data2)(int, size_t, int, int, void **); @@ -125,6 +129,7 @@ extern const struct raid6_recov_calls raid6_recov_s390xc; extern const struct raid6_recov_calls raid6_recov_neon; extern const struct raid6_recov_calls raid6_recov_lsx; extern const struct raid6_recov_calls raid6_recov_lasx; +extern const struct raid6_recov_calls raid6_recov_rvv; extern const struct raid6_calls raid6_neonx1; extern const struct raid6_calls raid6_neonx2; -- cgit v1.2.3 From 044d2aee6c575231ed4a24fb3d119bad0937488b Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 21 May 2025 09:06:02 -0700 Subject: alloc_tag: handle module codetag load errors as module load failures Failures inside codetag_load_module() are currently ignored. As a result an error there would not cause a module load failure and freeing of the associated resources. Correct this behavior by propagating the error code to the caller and handling possible errors. With this change, error to allocate percpu counters, which happens at this stage, will not be ignored and will cause a module load failure and freeing of resources. With this change we also do not need to disable memory allocation profiling when this error happens, instead we fail to load the module. Link: https://lkml.kernel.org/r/20250521160602.1940771-1-surenb@google.com Fixes: 10075262888b ("alloc_tag: allocate percpu counters for module tags dynamically") Signed-off-by: Suren Baghdasaryan Reported-by: Casey Chen Closes: https://lore.kernel.org/all/20250520231620.15259-1-cachen@purestorage.com/ Cc: Daniel Gomez Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Luis Chamberalin Cc: Petr Pavlu Cc: Sami Tolvanen Cc: Signed-off-by: Andrew Morton --- include/linux/codetag.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/codetag.h b/include/linux/codetag.h index 0ee4c21c6dbc..5f2b9a1f722c 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -36,8 +36,8 @@ union codetag_ref { struct codetag_type_desc { const char *section; size_t tag_size; - void (*module_load)(struct module *mod, - struct codetag *start, struct codetag *end); + int (*module_load)(struct module *mod, + struct codetag *start, struct codetag *end); void (*module_unload)(struct module *mod, struct codetag *start, struct codetag *end); #ifdef CONFIG_MODULES @@ -89,7 +89,7 @@ void *codetag_alloc_module_section(struct module *mod, const char *name, unsigned long align); void codetag_free_module_sections(struct module *mod); void codetag_module_replaced(struct module *mod, struct module *new_mod); -void codetag_load_module(struct module *mod); +int codetag_load_module(struct module *mod); void codetag_unload_module(struct module *mod); #else /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */ @@ -103,7 +103,7 @@ codetag_alloc_module_section(struct module *mod, const char *name, unsigned long align) { return NULL; } static inline void codetag_free_module_sections(struct module *mod) {} static inline void codetag_module_replaced(struct module *mod, struct module *new_mod) {} -static inline void codetag_load_module(struct module *mod) {} +static inline int codetag_load_module(struct module *mod) { return 0; } static inline void codetag_unload_module(struct module *mod) {} #endif /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */ -- cgit v1.2.3 From 081056dc00a27bccb55ccc3c6f230a3d5fd3f7e0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 27 May 2025 23:23:53 +0200 Subject: mm/hugetlb: unshare page tables during VMA split, not before Currently, __split_vma() triggers hugetlb page table unsharing through vm_ops->may_split(). This happens before the VMA lock and rmap locks are taken - which is too early, it allows racing VMA-locked page faults in our process and racing rmap walks from other processes to cause page tables to be shared again before we actually perform the split. Fix it by explicitly calling into the hugetlb unshare logic from __split_vma() in the same place where THP splitting also happens. At that point, both the VMA and the rmap(s) are write-locked. An annoying detail is that we can now call into the helper hugetlb_unshare_pmds() from two different locking contexts: 1. from hugetlb_split(), holding: - mmap lock (exclusively) - VMA lock - file rmap lock (exclusively) 2. hugetlb_unshare_all_pmds(), which I think is designed to be able to call us with only the mmap lock held (in shared mode), but currently only runs while holding mmap lock (exclusively) and VMA lock Backporting note: This commit fixes a racy protection that was introduced in commit b30c14cd6102 ("hugetlb: unshare some PMDs when splitting VMAs"); that commit claimed to fix an issue introduced in 5.13, but it should actually also go all the way back. [jannh@google.com: v2] Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a@google.com Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a@google.com Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a@google.com Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page") Signed-off-by: Jann Horn Cc: Liam Howlett Reviewed-by: Lorenzo Stoakes Reviewed-by: Oscar Salvador Cc: Lorenzo Stoakes Cc: Vlastimil Babka Cc: [b30c14cd6102: hugetlb: unshare some PMDs when splitting VMAs] Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0598f36931de..42f374e828a2 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -279,6 +279,7 @@ bool is_hugetlb_entry_migration(pte_t pte); bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); void fixup_hugetlb_reservations(struct vm_area_struct *vma); +void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); #else /* !CONFIG_HUGETLB_PAGE */ @@ -476,6 +477,8 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) { } +static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {} + #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write -- cgit v1.2.3 From bab77c0d191e241d2d59a845c7ed68bfa6e1b257 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 4 May 2025 13:28:37 -0400 Subject: finish_automount(): don't leak MNT_LOCKED from parent to child Intention for MNT_LOCKED had always been to protect the internal mountpoints within a subtree that got copied across the userns boundary, not the mountpoint that tree got attached to - after all, it _was_ exposed before the copying. For roots of secondary copies that is enforced in attach_recursive_mnt() - MNT_LOCKED is explicitly stripped for those. For the root of primary copy we are almost always guaranteed that MNT_LOCKED won't be there, so attach_recursive_mnt() doesn't bother. Unfortunately, one call chain got overlooked - triggering e.g. NFS referral will have the submount inherit the public flags from parent; that's fine for such things as read-only, nosuid, etc., but not for MNT_LOCKED. This is particularly pointless since the mount attached by finish_automount() is usually expirable, which makes any protection granted by MNT_LOCKED null and void; just wait for a while and that mount will go away on its own. Include MNT_LOCKED into the set of flags to be ignored by do_add_mount() - it really is an internal flag. Reviewed-by: Christian Brauner Fixes: 5ff9d8a65ce8 ("vfs: Lock in place mounts from more privileged users") Signed-off-by: Al Viro --- include/linux/mount.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mount.h b/include/linux/mount.h index 6904ad33ee7a..1a3136e53eaa 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -65,7 +65,8 @@ enum mount_flags { MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME, MNT_INTERNAL_FLAGS = MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED, + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | + MNT_LOCKED, }; struct vfsmount { -- cgit v1.2.3 From 41cb08555c4164996d67c78b3bf1c658075b75f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 May 2025 07:51:14 +0200 Subject: treewide, timers: Rename from_timer() to timer_container_of() Move this API to the canonical timer_*() namespace. [ tglx: Redone against pre rc1 ] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/aB2X0jCKQO56WdMt@gmail.com --- include/linux/timer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index f636f55c427d..0414d9e6b4fc 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -129,7 +129,7 @@ extern void timer_destroy_on_stack(struct timer_list *timer); static inline void timer_destroy_on_stack(struct timer_list *timer) { } #endif -#define from_timer(var, callback_timer, timer_fieldname) \ +#define timer_container_of(var, callback_timer, timer_fieldname) \ container_of(callback_timer, typeof(*var), timer_fieldname) /** -- cgit v1.2.3 From 1a4eabf662543c62ae1e71a26d1c8e6643c66388 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 1 Jul 2025 15:32:01 +0100 Subject: mfd: adp5585: Refactor how regmap defaults are handled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only thing changing between variants is the regmap default registers. Hence, instead of having a regmap configuration for every variant (duplicating lots of fields), add a chip info type of structure with a regmap ID to identify which defaults to use and populate regmap_config at runtime given a template plus the id. Also note that between variants, the defaults can be the same which means the chip info structure can be used in more than one compatible. This will also make it simpler adding new chips with more variants. Also note that the chip info structures are deliberately not const as they will also contain lots of members that are the same between the different devices variants and so we will fill those at runtime. Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-6-b1fcfe9e9826@analog.com Signed-off-by: Lee Jones --- include/linux/mfd/adp5585.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h index 016033cd68e4..c56af8d8d76c 100644 --- a/include/linux/mfd/adp5585.h +++ b/include/linux/mfd/adp5585.h @@ -119,8 +119,19 @@ struct regmap; +enum adp5585_variant { + ADP5585_00 = 1, + ADP5585_01, + ADP5585_02, + ADP5585_03, + ADP5585_04, + ADP5585_MAX +}; + struct adp5585_dev { + struct device *dev; struct regmap *regmap; + enum adp5585_variant variant; }; #endif -- cgit v1.2.3 From 0190a72f28ee0995c546fd4fcf80ed25a0fc4b28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 1 Jul 2025 15:32:02 +0100 Subject: mfd: adp5585: Add support for adp5589 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ADP5589 is a 19 I/O port expander with built-in keypad matrix decoder, programmable logic, reset generator, and PWM generator. Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-7-b1fcfe9e9826@analog.com Signed-off-by: Lee Jones --- include/linux/mfd/adp5585.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h index c56af8d8d76c..70e58122a36a 100644 --- a/include/linux/mfd/adp5585.h +++ b/include/linux/mfd/adp5585.h @@ -117,6 +117,12 @@ #define ADP5585_BANK(n) ((n) >= 6 ? 1 : 0) #define ADP5585_BIT(n) ((n) >= 6 ? BIT((n) - 6) : BIT(n)) +/* ADP5589 */ +#define ADP5589_MAN_ID_VALUE 0x10 +#define ADP5589_GPI_STATUS_C 0x18 +#define ADP5589_INT_EN 0x4e +#define ADP5589_MAX_REG ADP5589_INT_EN + struct regmap; enum adp5585_variant { @@ -125,6 +131,9 @@ enum adp5585_variant { ADP5585_02, ADP5585_03, ADP5585_04, + ADP5589_00, + ADP5589_01, + ADP5589_02, ADP5585_MAX }; @@ -132,6 +141,7 @@ struct adp5585_dev { struct device *dev; struct regmap *regmap; enum adp5585_variant variant; + unsigned int id; }; #endif -- cgit v1.2.3 From 7077fb501b95360c7fe35553f2bdb1ccf34edd16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 1 Jul 2025 15:32:03 +0100 Subject: mfd: adp5585: Add a per chip reg struture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are some differences in the register map between the devices. Hence, add a register structure per device. This will be needed in following patches. On top of that adp5585_fill_regmap_config() is renamed and reworked so that the current struct adp5585_info act as template (they indeed contain all the different data between variants) which can then be complemented depending on the device (as identified by the id register). This is done like this since a lot of the data is pretty much the same between variants of the same device. Reviewed-by: Lee Jones Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-8-b1fcfe9e9826@analog.com Signed-off-by: Lee Jones --- include/linux/mfd/adp5585.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h index 70e58122a36a..6ecb90a6276c 100644 --- a/include/linux/mfd/adp5585.h +++ b/include/linux/mfd/adp5585.h @@ -120,6 +120,7 @@ /* ADP5589 */ #define ADP5589_MAN_ID_VALUE 0x10 #define ADP5589_GPI_STATUS_C 0x18 +#define ADP5589_PIN_CONFIG_D 0x4C #define ADP5589_INT_EN 0x4e #define ADP5589_MAX_REG ADP5589_INT_EN @@ -137,9 +138,14 @@ enum adp5585_variant { ADP5585_MAX }; +struct adp5585_regs { + unsigned int ext_cfg; +}; + struct adp5585_dev { struct device *dev; struct regmap *regmap; + const struct adp5585_regs *regs; enum adp5585_variant variant; unsigned int id; }; -- cgit v1.2.3 From 9f425bf713b511b1078e0fea5a88c497e13dbb64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 1 Jul 2025 15:32:04 +0100 Subject: gpio: adp5585: add support for the adp5589 expander MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support the adp5589 I/O expander which supports up to 19 pins. We need to add a chip_info based struct since accessing register "banks" and "bits" differs between devices. Also some register addresses are different. While at it move ADP558X_GPIO_MAX defines to the main header file and rename them. That information will be needed by the top level device in a following change. Acked-by: Linus Walleij Acked-by: Bartosz Golaszewski Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-9-b1fcfe9e9826@analog.com Signed-off-by: Lee Jones --- include/linux/mfd/adp5585.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h index 6ecb90a6276c..d26f722cf31a 100644 --- a/include/linux/mfd/adp5585.h +++ b/include/linux/mfd/adp5585.h @@ -107,23 +107,23 @@ #define ADP5585_MAX_REG ADP5585_INT_EN -/* - * Bank 0 covers pins "GPIO 1/R0" to "GPIO 6/R5", numbered 0 to 5 by the - * driver, and bank 1 covers pins "GPIO 7/C0" to "GPIO 11/C4", numbered 6 to - * 10. Some variants of the ADP5585 don't support "GPIO 6/R5". As the driver - * uses identical GPIO numbering for all variants to avoid confusion, GPIO 5 is - * marked as reserved in the device tree for variants that don't support it. - */ -#define ADP5585_BANK(n) ((n) >= 6 ? 1 : 0) -#define ADP5585_BIT(n) ((n) >= 6 ? BIT((n) - 6) : BIT(n)) +#define ADP5585_PIN_MAX 11 /* ADP5589 */ #define ADP5589_MAN_ID_VALUE 0x10 +#define ADP5589_GPI_STATUS_A 0x16 #define ADP5589_GPI_STATUS_C 0x18 +#define ADP5589_RPULL_CONFIG_A 0x19 +#define ADP5589_DEBOUNCE_DIS_A 0x27 +#define ADP5589_GPO_DATA_OUT_A 0x2a +#define ADP5589_GPO_OUT_MODE_A 0x2d +#define ADP5589_GPIO_DIRECTION_A 0x30 #define ADP5589_PIN_CONFIG_D 0x4C #define ADP5589_INT_EN 0x4e #define ADP5589_MAX_REG ADP5589_INT_EN +#define ADP5589_PIN_MAX 19 + struct regmap; enum adp5585_variant { -- cgit v1.2.3 From 75024f97e82e63d02b0743500efb1e264a1c2dd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 1 Jul 2025 15:32:05 +0100 Subject: pwm: adp5585: add support for adp5589 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for the adp5589 I/O expander. From a PWM point of view it is pretty similar to adp5585. Main difference is the address of registers meaningful for configuring the PWM. Acked-by: Uwe Kleine-König Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-10-b1fcfe9e9826@analog.com Signed-off-by: Lee Jones --- include/linux/mfd/adp5585.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h index d26f722cf31a..77f7c74f084d 100644 --- a/include/linux/mfd/adp5585.h +++ b/include/linux/mfd/adp5585.h @@ -118,6 +118,9 @@ #define ADP5589_GPO_DATA_OUT_A 0x2a #define ADP5589_GPO_OUT_MODE_A 0x2d #define ADP5589_GPIO_DIRECTION_A 0x30 +#define ADP5589_PWM_OFFT_LOW 0x3e +#define ADP5589_PWM_ONT_LOW 0x40 +#define ADP5589_PWM_CFG 0x42 #define ADP5589_PIN_CONFIG_D 0x4C #define ADP5589_INT_EN 0x4e #define ADP5589_MAX_REG ADP5589_INT_EN -- cgit v1.2.3 From 47a1f759b776ec9287f675f5d4fbf60b94cc566d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 1 Jul 2025 15:32:07 +0100 Subject: mfd: adp5585: Add support for event handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These devices are capable of generate FIFO based events based on KEY or GPI presses. Add support for handling these events. This is in preparation of adding full support for keymap and gpis based events. Reviewed-by: Lee Jones Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-12-b1fcfe9e9826@analog.com Signed-off-by: Lee Jones --- include/linux/mfd/adp5585.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h index 77f7c74f084d..43a33a3d3f5a 100644 --- a/include/linux/mfd/adp5585.h +++ b/include/linux/mfd/adp5585.h @@ -10,13 +10,20 @@ #define __MFD_ADP5585_H_ #include +#include #define ADP5585_ID 0x00 #define ADP5585_MAN_ID_VALUE 0x20 #define ADP5585_MAN_ID_MASK GENMASK(7, 4) +#define ADP5585_REV_ID_MASK GENMASK(3, 0) #define ADP5585_INT_STATUS 0x01 +#define ADP5585_OVRFLOW_INT BIT(2) +#define ADP5585_EVENT_INT BIT(0) #define ADP5585_STATUS 0x02 +#define ADP5585_EC_MASK GENMASK(4, 0) #define ADP5585_FIFO_1 0x03 +#define ADP5585_KEV_EV_PRESS_MASK BIT(7) +#define ADP5585_KEY_EVENT_MASK GENMASK(6, 0) #define ADP5585_FIFO_2 0x04 #define ADP5585_FIFO_3 0x05 #define ADP5585_FIFO_4 0x06 @@ -32,6 +39,7 @@ #define ADP5585_FIFO_14 0x10 #define ADP5585_FIFO_15 0x11 #define ADP5585_FIFO_16 0x12 +#define ADP5585_EV_MAX (ADP5585_FIFO_16 - ADP5585_FIFO_1 + 1) #define ADP5585_GPI_INT_STAT_A 0x13 #define ADP5585_GPI_INT_STAT_B 0x14 #define ADP5585_GPI_STATUS_A 0x15 @@ -104,6 +112,8 @@ #define ADP5585_INT_CFG BIT(1) #define ADP5585_RST_CFG BIT(0) #define ADP5585_INT_EN 0x3c +#define ADP5585_OVRFLOW_IEN BIT(2) +#define ADP5585_EVENT_IEN BIT(0) #define ADP5585_MAX_REG ADP5585_INT_EN @@ -121,7 +131,9 @@ #define ADP5589_PWM_OFFT_LOW 0x3e #define ADP5589_PWM_ONT_LOW 0x40 #define ADP5589_PWM_CFG 0x42 +#define ADP5589_POLL_PTIME_CFG 0x48 #define ADP5589_PIN_CONFIG_D 0x4C +#define ADP5589_GENERAL_CFG 0x4d #define ADP5589_INT_EN 0x4e #define ADP5589_MAX_REG ADP5589_INT_EN @@ -142,15 +154,21 @@ enum adp5585_variant { }; struct adp5585_regs { + unsigned int gen_cfg; unsigned int ext_cfg; + unsigned int int_en; + unsigned int poll_ptime_cfg; }; struct adp5585_dev { struct device *dev; struct regmap *regmap; const struct adp5585_regs *regs; + struct blocking_notifier_head event_notifier; enum adp5585_variant variant; unsigned int id; + int irq; + unsigned int ev_poll_time; }; #endif -- cgit v1.2.3 From 333812da70d5f71bf5e176f6d55a5f716301b5fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 1 Jul 2025 15:32:08 +0100 Subject: mfd: adp5585: Support reset and unlock events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ADP558x family of devices can be programmed to respond to some especial events, In case of the unlock events, one can lock the keypad and use KEYS or GPIs events to unlock it. For the reset events, one can again use a combinations of GPIs/KEYs in order to generate an event that will trigger the device to generate an output reset pulse. Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-13-b1fcfe9e9826@analog.com Signed-off-by: Lee Jones --- include/linux/mfd/adp5585.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h index 43a33a3d3f5a..db483ef9693a 100644 --- a/include/linux/mfd/adp5585.h +++ b/include/linux/mfd/adp5585.h @@ -68,6 +68,7 @@ #define ADP5585_GPIO_DIRECTION_A 0x27 #define ADP5585_GPIO_DIRECTION_B 0x28 #define ADP5585_RESET1_EVENT_A 0x29 +#define ADP5585_RESET_EV_PRESS BIT(7) #define ADP5585_RESET1_EVENT_B 0x2a #define ADP5585_RESET1_EVENT_C 0x2b #define ADP5585_RESET2_EVENT_A 0x2c @@ -118,6 +119,13 @@ #define ADP5585_MAX_REG ADP5585_INT_EN #define ADP5585_PIN_MAX 11 +#define ADP5585_MAX_UNLOCK_TIME_SEC 7 +#define ADP5585_KEY_EVENT_START 1 +#define ADP5585_KEY_EVENT_END 25 +#define ADP5585_GPI_EVENT_START 37 +#define ADP5585_GPI_EVENT_END 47 +#define ADP5585_ROW5_KEY_EVENT_START 1 +#define ADP5585_ROW5_KEY_EVENT_END 30 /* ADP5589 */ #define ADP5589_MAN_ID_VALUE 0x10 @@ -128,6 +136,20 @@ #define ADP5589_GPO_DATA_OUT_A 0x2a #define ADP5589_GPO_OUT_MODE_A 0x2d #define ADP5589_GPIO_DIRECTION_A 0x30 +#define ADP5589_UNLOCK1 0x33 +#define ADP5589_UNLOCK_EV_PRESS BIT(7) +#define ADP5589_UNLOCK_TIMERS 0x36 +#define ADP5589_UNLOCK_TIMER GENMASK(2, 0) +#define ADP5589_LOCK_CFG 0x37 +#define ADP5589_LOCK_EN BIT(0) +#define ADP5589_RESET1_EVENT_A 0x38 +#define ADP5589_RESET2_EVENT_A 0x3B +#define ADP5589_RESET_CFG 0x3D +#define ADP5585_RESET2_POL BIT(7) +#define ADP5585_RESET1_POL BIT(6) +#define ADP5585_RST_PASSTHRU_EN BIT(5) +#define ADP5585_RESET_TRIG_TIME GENMASK(4, 2) +#define ADP5585_PULSE_WIDTH GENMASK(1, 0) #define ADP5589_PWM_OFFT_LOW 0x3e #define ADP5589_PWM_ONT_LOW 0x40 #define ADP5589_PWM_CFG 0x42 @@ -138,6 +160,11 @@ #define ADP5589_MAX_REG ADP5589_INT_EN #define ADP5589_PIN_MAX 19 +#define ADP5589_KEY_EVENT_START 1 +#define ADP5589_KEY_EVENT_END 88 +#define ADP5589_GPI_EVENT_START 97 +#define ADP5589_GPI_EVENT_END 115 +#define ADP5589_UNLOCK_WILDCARD 127 struct regmap; @@ -158,6 +185,9 @@ struct adp5585_regs { unsigned int ext_cfg; unsigned int int_en; unsigned int poll_ptime_cfg; + unsigned int reset_cfg; + unsigned int reset1_event_a; + unsigned int reset2_event_a; }; struct adp5585_dev { @@ -167,8 +197,18 @@ struct adp5585_dev { struct blocking_notifier_head event_notifier; enum adp5585_variant variant; unsigned int id; + bool has_unlock; + bool has_pin6; int irq; unsigned int ev_poll_time; + unsigned int unlock_time; + unsigned int unlock_keys[2]; + unsigned int nkeys_unlock; + unsigned int reset1_keys[3]; + unsigned int nkeys_reset1; + unsigned int reset2_keys[2]; + unsigned int nkeys_reset2; + u8 reset_cfg; }; #endif -- cgit v1.2.3 From bd113a13e1fa51789f55987369b80e1d8bc19389 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 1 Jul 2025 15:32:09 +0100 Subject: mfd: adp5585: Add support for input devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ADP558x family supports a built in keypad matrix decoder which can be added as an Input device. In order to both support the Input and the GPIO device, we need to create a bitmap of the supported pins and track their usage since they can either be used as GPIOs (GPIs) or as part of the keymap. We also need to mark special pins busy in case some features are being used (ex: pwm or reset events). Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-14-b1fcfe9e9826@analog.com Signed-off-by: Lee Jones --- include/linux/mfd/adp5585.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h index db483ef9693a..41c5d2e1cc7c 100644 --- a/include/linux/mfd/adp5585.h +++ b/include/linux/mfd/adp5585.h @@ -126,6 +126,10 @@ #define ADP5585_GPI_EVENT_END 47 #define ADP5585_ROW5_KEY_EVENT_START 1 #define ADP5585_ROW5_KEY_EVENT_END 30 +#define ADP5585_PWM_OUT 3 +#define ADP5585_RESET1_OUT 4 +#define ADP5585_RESET2_OUT 9 +#define ADP5585_ROW5 5 /* ADP5589 */ #define ADP5589_MAN_ID_VALUE 0x10 @@ -154,6 +158,7 @@ #define ADP5589_PWM_ONT_LOW 0x40 #define ADP5589_PWM_CFG 0x42 #define ADP5589_POLL_PTIME_CFG 0x48 +#define ADP5589_PIN_CONFIG_A 0x49 #define ADP5589_PIN_CONFIG_D 0x4C #define ADP5589_GENERAL_CFG 0x4d #define ADP5589_INT_EN 0x4e @@ -165,6 +170,7 @@ #define ADP5589_GPI_EVENT_START 97 #define ADP5589_GPI_EVENT_END 115 #define ADP5589_UNLOCK_WILDCARD 127 +#define ADP5589_RESET2_OUT 12 struct regmap; @@ -188,6 +194,7 @@ struct adp5585_regs { unsigned int reset_cfg; unsigned int reset1_event_a; unsigned int reset2_event_a; + unsigned int pin_cfg_a; }; struct adp5585_dev { @@ -195,6 +202,9 @@ struct adp5585_dev { struct regmap *regmap; const struct adp5585_regs *regs; struct blocking_notifier_head event_notifier; + unsigned long *pin_usage; + unsigned int n_pins; + unsigned int reset2_out; enum adp5585_variant variant; unsigned int id; bool has_unlock; -- cgit v1.2.3 From 988b28a83b658137e58123f4dafc3a1588e1cb2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 1 Jul 2025 15:32:10 +0100 Subject: gpio: adp5585: support gpi events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for adding GPIs to the event FIFO. This is done by adding irq_chip support. Like this, one can use the input gpio_keys driver as a "frontend" device and input handler. As part of this change, we now implement .request() and .free() as we can't blindly consume all available pins as GPIOs (example: some pins can be used for forming a keymap matrix). Also note that the number of pins can now be obtained from the parent, top level device. Hence the 'max_gpio' variable can be removed. Reviewed-by: Linus Walleij Acked-by: Bartosz Golaszewski Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-15-b1fcfe9e9826@analog.com Signed-off-by: Lee Jones --- include/linux/mfd/adp5585.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h index 41c5d2e1cc7c..5237da6b4a9f 100644 --- a/include/linux/mfd/adp5585.h +++ b/include/linux/mfd/adp5585.h @@ -136,6 +136,8 @@ #define ADP5589_GPI_STATUS_A 0x16 #define ADP5589_GPI_STATUS_C 0x18 #define ADP5589_RPULL_CONFIG_A 0x19 +#define ADP5589_GPI_INT_LEVEL_A 0x1e +#define ADP5589_GPI_EVENT_EN_A 0x21 #define ADP5589_DEBOUNCE_DIS_A 0x27 #define ADP5589_GPO_DATA_OUT_A 0x2a #define ADP5589_GPO_OUT_MODE_A 0x2d -- cgit v1.2.3 From ea4d331050b4cd43e6a900937db88b01ef75e1f2 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Wed, 16 Oct 2024 06:02:41 +0200 Subject: Input: touch-overlay - add touchscreen overlay handling Some touch devices provide mechanical overlays with different objects like buttons or clipped touchscreen surfaces. In order to support these objects, add a series of helper functions to the input subsystem to transform them into overlay objects via device tree nodes. These overlay objects consume the raw touch events and report the expected input events depending on the object properties. Note that the current implementation allows for multiple definitions of touchscreen areas (regions that report touch events), but only the first one will be used for the touchscreen device that the consumers typically provide. Should the need for multiple touchscreen areas arise, additional touchscreen devices would be required at the consumer side. There is no limitation in the number of touch areas defined as buttons. Reviewed-by: Jeff LaBundy Signed-off-by: Javier Carrasco Link: https://lore.kernel.org/r/20241016-feature-ts_virtobj_patch-v11-2-b292a1bbb0a1@wolfvision.net Signed-off-by: Dmitry Torokhov --- include/linux/input/touch-overlay.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 include/linux/input/touch-overlay.h (limited to 'include/linux') diff --git a/include/linux/input/touch-overlay.h b/include/linux/input/touch-overlay.h new file mode 100644 index 000000000000..0253e554d3cd --- /dev/null +++ b/include/linux/input/touch-overlay.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2023 Javier Carrasco + */ + +#ifndef _TOUCH_OVERLAY +#define _TOUCH_OVERLAY + +#include + +struct input_dev; + +int touch_overlay_map(struct list_head *list, struct input_dev *input); + +void touch_overlay_get_touchscreen_abs(struct list_head *list, u16 *x, u16 *y); + +bool touch_overlay_mapped_touchscreen(struct list_head *list); + +bool touch_overlay_process_contact(struct list_head *list, + struct input_dev *input, + struct input_mt_pos *pos, int slot); + +void touch_overlay_sync_frame(struct list_head *list, struct input_dev *input); + +#endif -- cgit v1.2.3